[rpms/firefox] rawhide: add ppc64le JIT -

public inbox for git-commits@fedoraproject.org
help / color / mirror / Atom feed

To: git-commits@fedoraproject.org
Subject: [rpms/firefox] rawhide: add ppc64le JIT
Date: Tue, 16 Jun 2026 13:10:58 GMT	[thread overview]
Message-ID: <178161545850.1.13214606391308119450.rpms-firefox-e99f0d4925ac@fedoraproject.org> (raw)

A new commit has been pushed.

Repo   : rpms/firefox
Branch : rawhide
Commit : e99f0d4925ac596ad75f2ae084620d36c44a85c2
Author : Dan Horák <dan@danny.cz>
Date   : 2026-06-16T13:10:34+00:00
Stats  : +42086/-0 in 4 file(s)
URL    : https://src.fedoraproject.org/rpms/firefox/c/e99f0d4925ac596ad75f2ae084620d36c44a85c2?branch=rawhide

Log:
add ppc64le JIT

---
diff --git a/0001-Add-VSX-instructions-for-SKIA.patch b/0001-Add-VSX-instructions-for-SKIA.patch
new file mode 100644
index 0000000..ac3a0d8
--- /dev/null
+++ b/0001-Add-VSX-instructions-for-SKIA.patch
@@ -0,0 +1,1347 @@
+From a47c991dbbfb709134737a54e8bbe7e0b1bce800 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Trung=20L=C3=AA?= <8@tle.id.au>
+Date: Fri, 12 Jun 2026 15:23:10 +1000
+Subject: [PATCH 1/3] Add VSX instructions for SKIA
+
+Adapted from work done by Raptor Engineering for chromium's vendored
+SKIA
+
+Co-authored-by: Timothy Pearson <tpearson@raptorengineering.com>
+---
+ gfx/skia/skia/src/base/SkVx.h                 |  58 +++-
+ gfx/skia/skia/src/core/SkBlitRow_D32.cpp      |  98 ++++++
+ gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp   | 268 ++++++++++++++++
+ .../skia/src/opts/SkBitmapProcState_opts.h    | 164 ++++++++++
+ gfx/skia/skia/src/opts/SkBlitRow_opts.h       |  48 +++
+ .../skia/src/opts/SkRasterPipeline_opts.h     | 237 ++++++++++++++
+ gfx/skia/skia/src/opts/SkSwizzler_opts.inc    | 289 ++++++++++++++++++
+ 7 files changed, 1160 insertions(+), 2 deletions(-)
+
+diff --git a/gfx/skia/skia/src/base/SkVx.h b/gfx/skia/skia/src/base/SkVx.h
+index f87ca44d4af0..ed80c91fd38e 100644
+--- a/gfx/skia/skia/src/base/SkVx.h
++++ b/gfx/skia/skia/src/base/SkVx.h
+@@ -52,6 +52,8 @@
+         #include <arm_neon.h>
+     #elif defined(__wasm_simd128__)
+         #include <wasm_simd128.h>
++    #elif defined(SK_CPU_PPC) && defined(__VSX__)
++        #include <altivec.h>
+     #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
+         #include <lasxintrin.h>
+         #include <lsxintrin.h>
+@@ -509,6 +511,14 @@ SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec
+                                               sk_bit_cast<uint8x16_t>(e)));
+     }
+ #endif
++#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
++    if constexpr (N*sizeof(T) == 16) {
++        return sk_bit_cast<Vec<N,T>>(
++                vec_sel(sk_bit_cast<__vector unsigned char>(e),
++                        sk_bit_cast<__vector unsigned char>(t),
++                        sk_bit_cast<__vector unsigned char>(cond)));
++    }
++#endif
+ #if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
+     if constexpr (N*sizeof(T) == 32) {
+         return sk_bit_cast<Vec<N,T>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
+@@ -579,6 +589,11 @@ SINT bool any(const Vec<N,T>& x) {
+                                                           sk_bit_cast<__m128i>(x)));
+         return retv[0] != 0b0000;
+     }
++#endif
++#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
++    if constexpr (N*sizeof(T) == 16) {
++        return vec_any_ne(sk_bit_cast<__vector unsigned int>(x), vec_splats(0u));
++    }
+ #endif
+     return any(x.lo)
+         || any(x.hi);
+@@ -622,6 +637,11 @@ SINT bool all(const Vec<N,T>& x) {
+                                                           sk_bit_cast<__m128i>(x)));
+         return retv[0] == 0b1111;
+     }
++#endif
++#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
++    if constexpr (N*sizeof(T) == 16) {
++        return vec_all_ne(sk_bit_cast<__vector unsigned int>(x), vec_splats(0u));
++    }
+ #endif
+     return all(x.lo)
+         && all(x.hi);
+@@ -647,8 +667,22 @@ SIT  T max(const Vec<1,T>& x) { return x.val; }
+ SINT T min(const Vec<N,T>& x) { return std::min(min(x.lo), min(x.hi)); }
+ SINT T max(const Vec<N,T>& x) { return std::max(max(x.lo), max(x.hi)); }
+ 
+-SINT Vec<N,T> min(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(y < x, y, x); }
+-SINT Vec<N,T> max(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(x < y, y, x); }
++SINT Vec<N,T> min(const Vec<N,T>& x, const Vec<N,T>& y) {
++#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
++    if constexpr (N*sizeof(T) == 16) {
++        return sk_bit_cast<Vec<N,T>>(vec_min(to_vext(x), to_vext(y)));
++    }
++#endif
++    return naive_if_then_else(y < x, y, x);
++}
++SINT Vec<N,T> max(const Vec<N,T>& x, const Vec<N,T>& y) {
++#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
++    if constexpr (N*sizeof(T) == 16) {
++        return sk_bit_cast<Vec<N,T>>(vec_max(to_vext(x), to_vext(y)));
++    }
++#endif
++    return naive_if_then_else(x < y, y, x);
++}
+ 
+ SINTU Vec<N,T> min(const Vec<N,T>& x, U y) { return min(x, Vec<N,T>(y)); }
+ SINTU Vec<N,T> max(const Vec<N,T>& x, U y) { return max(x, Vec<N,T>(y)); }
+@@ -960,6 +994,26 @@ SIN Vec<N,uint16_t> mulhi(const Vec<N,uint16_t>& x,
+     } else { // N > 8
+         return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));
+     }
++#elif SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
++    if constexpr (N == 8) {
++        // u16*u16 -> u32 even/odd products (vmuleuh/vmulouh), then gather the
++        // high 16 bits of each back into sequential lanes. Same idiom as the
++        // VSX scale() in SkSwizzler_opts.
++        __vector unsigned short xs = sk_bit_cast<__vector unsigned short>(x);
++        __vector unsigned short ys = sk_bit_cast<__vector unsigned short>(y);
++        __vector unsigned int even = vec_vmuleuh(xs, ys);
++        __vector unsigned int odd  = vec_vmulouh(xs, ys);
++        const __vector unsigned char hi = {
++            0x02,0x03, 0x12,0x13,  0x06,0x07, 0x16,0x17,
++            0x0A,0x0B, 0x1A,0x1B,  0x0E,0x0F, 0x1E,0x1F
++        };
++        return sk_bit_cast<Vec<8,uint16_t>>(
++            vec_perm((__vector unsigned char)even, (__vector unsigned char)odd, hi));
++    } else if constexpr (N < 8) {
++        return mulhi(join(x,x), join(y,y)).lo;
++    } else { // N > 8
++        return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));
++    }
+ #else
+     return skvx::cast<uint16_t>(mull(x, y) >> 16);
+ #endif
+diff --git a/gfx/skia/skia/src/core/SkBlitRow_D32.cpp b/gfx/skia/skia/src/core/SkBlitRow_D32.cpp
+index bcbf2e66bd46..920d6a9b2366 100644
+--- a/gfx/skia/skia/src/core/SkBlitRow_D32.cpp
++++ b/gfx/skia/skia/src/core/SkBlitRow_D32.cpp
+@@ -517,6 +517,104 @@ static void blit_row_s32_opaque(SkPMColor* dst,
+         }
+     }
+ 
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++    #include <altivec.h>
++
++    // dst + (((src - dst) * src_scale) >> 8), splayed into 16-bit lanes; the
++    // vec_* transcription of SkPMLerp_SSE2.
++    static inline __vector unsigned char SkPMLerp_VSX(__vector unsigned char src,
++                                                      __vector unsigned char dst,
++                                                      unsigned src_scale) {
++        const __vector unsigned int mask = vec_splats(0x00FF00FFu);
++        const __vector unsigned short eight = vec_splats((unsigned short)8);
++        __vector unsigned short src_rb = (__vector unsigned short)vec_and((__vector unsigned int)src, mask);
++        __vector unsigned short src_ag = vec_sr((__vector unsigned short)src, eight);
++        __vector unsigned short dst_rb = (__vector unsigned short)vec_and((__vector unsigned int)dst, mask);
++        __vector unsigned short dst_ag = vec_sr((__vector unsigned short)dst, eight);
++        __vector unsigned short s = vec_splats((unsigned short)src_scale);
++        __vector unsigned short diff_rb = vec_mul(vec_sub(src_rb, dst_rb), s);
++        __vector unsigned short diff_ag = vec_mul(vec_sub(src_ag, dst_ag), s);
++        diff_rb = vec_sr(diff_rb, eight);
++        __vector unsigned int diff = vec_or((__vector unsigned int)diff_rb,
++                                            vec_andc((__vector unsigned int)diff_ag, mask));
++        return vec_add(dst, (__vector unsigned char)diff);
++    }
++
++    static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
++        SkASSERT(alpha <= 255);
++        unsigned src_scale = SkAlpha255To256(alpha);
++        while (count >= 4) {
++            __vector unsigned char s = vec_xl(0, (const unsigned char*)src);
++            __vector unsigned char d = vec_xl(0, (const unsigned char*)dst);
++            vec_xst(SkPMLerp_VSX(s, d, src_scale), 0, (unsigned char*)dst);
++            src += 4; dst += 4; count -= 4;
++        }
++        while (count --> 0) {
++            *dst = SkPMLerp(*src, *dst, src_scale);
++            src++;
++            dst++;
++        }
++    }
++
++    // The vec_* transcription of SkBlendARGB32_SSE2: scale src by aa and dst by
++    // SkAlphaMulInv256(srcA, aa), then add the splayed halves.
++    static inline __vector unsigned char SkBlendARGB32_VSX(__vector unsigned char src,
++                                                           __vector unsigned char dst,
++                                                           unsigned aa) {
++        unsigned alpha = SkAlpha255To256(aa);
++        __vector unsigned short src_scale = vec_splats((unsigned short)alpha);
++        const __vector unsigned int mask = vec_splats(0x00FF00FFu);
++        const __vector unsigned short eight = vec_splats((unsigned short)8);
++
++        // dst_scale = SkAlphaMulInv256(SkGetPackedA32(src), alpha), per 32-bit lane.
++        __vector unsigned int srcA = vec_sr((__vector unsigned int)src, vec_splats(24u));
++        __vector unsigned int ds = (__vector unsigned int)vec_mul((__vector unsigned short)srcA, src_scale);
++        ds = vec_sub(vec_splats((unsigned int)0xFFFF), ds);
++        ds = vec_add(ds, vec_sr(ds, vec_splats(8u)));
++        ds = vec_sr(ds, vec_splats(8u));
++        // Duplicate the low 16-bit word of each 32-bit lane into both halves
++        // (the SSE shufflelo/shufflehi _MM_SHUFFLE(2,2,0,0)).
++        const __vector unsigned char dup = (__vector unsigned char){
++            0,1,0,1, 4,5,4,5, 8,9,8,9, 12,13,12,13
++        };
++        __vector unsigned short dst_scale =
++            (__vector unsigned short)vec_perm((__vector unsigned char)ds,
++                                              (__vector unsigned char)ds, dup);
++
++        __vector unsigned short src_rb = (__vector unsigned short)vec_and((__vector unsigned int)src, mask);
++        __vector unsigned short src_ag = vec_sr((__vector unsigned short)src, eight);
++        __vector unsigned short dst_rb = (__vector unsigned short)vec_and((__vector unsigned int)dst, mask);
++        __vector unsigned short dst_ag = vec_sr((__vector unsigned short)dst, eight);
++
++        src_rb = vec_mul(src_rb, src_scale);
++        src_ag = vec_mul(src_ag, src_scale);
++        dst_rb = vec_mul(dst_rb, dst_scale);
++        dst_ag = vec_mul(dst_ag, dst_scale);
++
++        dst_rb = vec_add(src_rb, dst_rb);
++        dst_ag = vec_add(src_ag, dst_ag);
++
++        dst_rb = vec_sr(dst_rb, eight);
++        __vector unsigned int out = vec_or((__vector unsigned int)dst_rb,
++                                           vec_andc((__vector unsigned int)dst_ag, mask));
++        return (__vector unsigned char)out;
++    }
++
++    static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
++        SkASSERT(alpha <= 255);
++        while (count >= 4) {
++            __vector unsigned char s = vec_xl(0, (const unsigned char*)src);
++            __vector unsigned char d = vec_xl(0, (const unsigned char*)dst);
++            vec_xst(SkBlendARGB32_VSX(s, d, alpha), 0, (unsigned char*)dst);
++            src += 4; dst += 4; count -= 4;
++        }
++        while (count --> 0) {
++            *dst = SkBlendARGB32(*src, *dst, alpha);
++            src++;
++            dst++;
++        }
++    }
++
+ #else
+     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
+         SkASSERT(alpha <= 255);
+diff --git a/gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp b/gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp
+index a7538027b85d..9669431292b6 100644
+--- a/gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp
++++ b/gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp
+@@ -480,6 +480,274 @@ static inline SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB,
+         }
+     }
+ 
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++    #include <altivec.h>
++
++    // Native VSX/AltiVec port of the SSE2 LCD-subpixel blend block below.
++    // Same algorithm — only the intrinsics change. Translations follow the
++    // GCC ppc_wrappers pattern (vec_mergeh/l, vec_packsu, etc.).
++
++    // The following (left) shifts cause the top 5 bits of the mask components to
++    // line up with the corresponding components in an SkPMColor.
++    // Note that the mask's RGB16 order may differ from the SkPMColor order.
++    #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
++    #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
++    #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
++
++    // Each macro must always return __vector unsigned int so the surrounding
++    // vec_and gets matching element types. The pass-through case (SHIFT == 0)
++    // still needs an explicit reinterpret-cast since `mask` is __vector
++    // unsigned char in our function signature.
++    #if SK_R16x5_R32x5_SHIFT == 0
++        #define SkPackedR16x5ToUnmaskedR32x5_VSX(x) ((__vector unsigned int)(x))
++    #elif SK_R16x5_R32x5_SHIFT > 0
++        #define SkPackedR16x5ToUnmaskedR32x5_VSX(x) \
++            vec_sl((__vector unsigned int)(x), vec_splats((unsigned int)SK_R16x5_R32x5_SHIFT))
++    #else
++        #define SkPackedR16x5ToUnmaskedR32x5_VSX(x) \
++            vec_sr((__vector unsigned int)(x), vec_splats((unsigned int)(-SK_R16x5_R32x5_SHIFT)))
++    #endif
++
++    #if SK_G16x5_G32x5_SHIFT == 0
++        #define SkPackedG16x5ToUnmaskedG32x5_VSX(x) ((__vector unsigned int)(x))
++    #elif SK_G16x5_G32x5_SHIFT > 0
++        #define SkPackedG16x5ToUnmaskedG32x5_VSX(x) \
++            vec_sl((__vector unsigned int)(x), vec_splats((unsigned int)SK_G16x5_G32x5_SHIFT))
++    #else
++        #define SkPackedG16x5ToUnmaskedG32x5_VSX(x) \
++            vec_sr((__vector unsigned int)(x), vec_splats((unsigned int)(-SK_G16x5_G32x5_SHIFT)))
++    #endif
++
++    #if SK_B16x5_B32x5_SHIFT == 0
++        #define SkPackedB16x5ToUnmaskedB32x5_VSX(x) ((__vector unsigned int)(x))
++    #elif SK_B16x5_B32x5_SHIFT > 0
++        #define SkPackedB16x5ToUnmaskedB32x5_VSX(x) \
++            vec_sl((__vector unsigned int)(x), vec_splats((unsigned int)SK_B16x5_B32x5_SHIFT))
++    #else
++        #define SkPackedB16x5ToUnmaskedB32x5_VSX(x) \
++            vec_sr((__vector unsigned int)(x), vec_splats((unsigned int)(-SK_B16x5_B32x5_SHIFT)))
++    #endif
++
++    static __vector unsigned char blend_lcd16_vsx(__vector unsigned char& src,
++                                                   __vector unsigned char& dst,
++                                                   __vector unsigned char& mask,
++                                                   __vector unsigned char& srcA) {
++        // Get the R,G,B of each 16bit mask pixel, all aligned to 5-bit positions.
++        __vector unsigned int r = vec_and(SkPackedR16x5ToUnmaskedR32x5_VSX(mask),
++                                          vec_splats((unsigned int)(0x1F << SK_R32_SHIFT)));
++        __vector unsigned int g = vec_and(SkPackedG16x5ToUnmaskedG32x5_VSX(mask),
++                                          vec_splats((unsigned int)(0x1F << SK_G32_SHIFT)));
++        __vector unsigned int b = vec_and(SkPackedB16x5ToUnmaskedB32x5_VSX(mask),
++                                          vec_splats((unsigned int)(0x1F << SK_B32_SHIFT)));
++
++        // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA.
++        __vector unsigned int rA = vec_sl(r, vec_splats((unsigned int)(SK_A32_SHIFT - SK_R32_SHIFT)));
++        __vector unsigned int gA = vec_sl(g, vec_splats((unsigned int)(SK_A32_SHIFT - SK_G32_SHIFT)));
++        __vector unsigned int bA = vec_sl(b, vec_splats((unsigned int)(SK_A32_SHIFT - SK_B32_SHIFT)));
++        __vector unsigned char aMin = vec_min(vec_min((__vector unsigned char)rA,
++                                                       (__vector unsigned char)gA),
++                                              (__vector unsigned char)bA);
++        __vector unsigned char aMax = vec_max(vec_max((__vector unsigned char)rA,
++                                                       (__vector unsigned char)gA),
++                                              (__vector unsigned char)bA);
++        // srcA has been biased to [0-256]; compare srcA against (dstA+1).
++        __vector unsigned int dstA = vec_and(vec_add((__vector unsigned int)dst,
++                                                     vec_splats((unsigned int)(1 << SK_A32_SHIFT))),
++                                             vec_splats((unsigned int)SK_A32_MASK));
++        __vector __bool int aLT = vec_cmplt((__vector signed int)srcA, (__vector signed int)dstA);
++        // a = (aMin & aLT) | (aMax & ~aLT)
++        __vector unsigned char a = vec_or(vec_and(aMin, (__vector unsigned char)aLT),
++                                          vec_andc(aMax, (__vector unsigned char)aLT));
++
++        // Pack the 4 16-bit mask pixels into 4 32-bit pixels (m0A, m0R, m0G, m0B, ...).
++        mask = vec_or(vec_or(a, (__vector unsigned char)r),
++                      vec_or((__vector unsigned char)g, (__vector unsigned char)b));
++
++        // Interleave into 16-bit words.
++        const __vector unsigned char zeros = vec_splats((unsigned char)0);
++        __vector unsigned short maskLo = (__vector unsigned short)vec_mergeh(mask, zeros);
++        __vector unsigned short maskHi = (__vector unsigned short)vec_mergel(mask, zeros);
++
++        // Upscale 0..31 -> 0..32 by adding (mask >> 4).
++        const __vector unsigned short v4 = vec_splats((unsigned short)4);
++        const __vector unsigned short v8 = vec_splats((unsigned short)8);
++        const __vector unsigned short v5 = vec_splats((unsigned short)5);
++        maskLo = vec_add(maskLo, vec_sr(maskLo, v4));
++        maskHi = vec_add(maskHi, vec_sr(maskHi, v4));
++
++        // Multiply by srcA per 16-bit lane.
++        maskLo = vec_mul(maskLo, (__vector unsigned short)srcA);
++        maskHi = vec_mul(maskHi, (__vector unsigned short)srcA);
++        // Divide by 256 (right-shift 8).
++        maskLo = vec_sr(maskLo, v8);
++        maskHi = vec_sr(maskHi, v8);
++
++        // Unpack dst into 16-bit words.
++        __vector signed short dstLo = (__vector signed short)vec_mergeh(dst, zeros);
++        __vector signed short dstHi = (__vector signed short)vec_mergel(dst, zeros);
++        // mask = (src - dst) * mask
++        __vector signed short srcS = (__vector signed short)src;
++        __vector signed short mLoS = vec_mul((__vector signed short)maskLo, vec_sub(srcS, dstLo));
++        __vector signed short mHiS = vec_mul((__vector signed short)maskHi, vec_sub(srcS, dstHi));
++        // arithmetic shift right by 5
++        mLoS = vec_sra(mLoS, (__vector unsigned short)v5);
++        mHiS = vec_sra(mHiS, (__vector unsigned short)v5);
++        // result = dst + ((src - dst) * mask >> 5)
++        __vector signed short resLo = vec_add(dstLo, mLoS);
++        __vector signed short resHi = vec_add(dstHi, mHiS);
++        // Pack 16-bit signed -> 8-bit unsigned with saturation.
++        return vec_packsu(resLo, resHi);
++    }
++
++    static __vector unsigned char blend_lcd16_opaque_vsx(__vector unsigned char& src,
++                                                          __vector unsigned char& dst,
++                                                          __vector unsigned char& mask) {
++        __vector unsigned int r = vec_and(SkPackedR16x5ToUnmaskedR32x5_VSX(mask),
++                                          vec_splats((unsigned int)(0x1F << SK_R32_SHIFT)));
++        __vector unsigned int g = vec_and(SkPackedG16x5ToUnmaskedG32x5_VSX(mask),
++                                          vec_splats((unsigned int)(0x1F << SK_G32_SHIFT)));
++        __vector unsigned int b = vec_and(SkPackedB16x5ToUnmaskedB32x5_VSX(mask),
++                                          vec_splats((unsigned int)(0x1F << SK_B32_SHIFT)));
++
++        // Opaque src: a = max(r, g, b) shifted to alpha lane.
++        __vector unsigned int rA = vec_sl(r, vec_splats((unsigned int)(SK_A32_SHIFT - SK_R32_SHIFT)));
++        __vector unsigned int gA = vec_sl(g, vec_splats((unsigned int)(SK_A32_SHIFT - SK_G32_SHIFT)));
++        __vector unsigned int bA = vec_sl(b, vec_splats((unsigned int)(SK_A32_SHIFT - SK_B32_SHIFT)));
++        __vector unsigned char a = vec_max(vec_max((__vector unsigned char)rA,
++                                                    (__vector unsigned char)gA),
++                                           (__vector unsigned char)bA);
++
++        mask = vec_or(vec_or(a, (__vector unsigned char)r),
++                      vec_or((__vector unsigned char)g, (__vector unsigned char)b));
++
++        const __vector unsigned char zeros = vec_splats((unsigned char)0);
++        __vector unsigned short maskLo = (__vector unsigned short)vec_mergeh(mask, zeros);
++        __vector unsigned short maskHi = (__vector unsigned short)vec_mergel(mask, zeros);
++
++        const __vector unsigned short v4 = vec_splats((unsigned short)4);
++        const __vector unsigned short v5 = vec_splats((unsigned short)5);
++        maskLo = vec_add(maskLo, vec_sr(maskLo, v4));
++        maskHi = vec_add(maskHi, vec_sr(maskHi, v4));
++
++        __vector signed short dstLo = (__vector signed short)vec_mergeh(dst, zeros);
++        __vector signed short dstHi = (__vector signed short)vec_mergel(dst, zeros);
++        __vector signed short srcS = (__vector signed short)src;
++        __vector signed short mLoS = vec_mul((__vector signed short)maskLo, vec_sub(srcS, dstLo));
++        __vector signed short mHiS = vec_mul((__vector signed short)maskHi, vec_sub(srcS, dstHi));
++        mLoS = vec_sra(mLoS, (__vector unsigned short)v5);
++        mHiS = vec_sra(mHiS, (__vector unsigned short)v5);
++        __vector signed short resLo = vec_add(dstLo, mLoS);
++        __vector signed short resHi = vec_add(dstHi, mHiS);
++        return vec_packsu(resLo, resHi);
++    }
++
++    void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src,
++                        int width, SkPMColor) {
++        if (width <= 0) {
++            return;
++        }
++        int srcA = SkColorGetA(src);
++        int srcR = SkColorGetR(src);
++        int srcG = SkColorGetG(src);
++        int srcB = SkColorGetB(src);
++        srcA = SkAlpha255To256(srcA);
++
++        if (width >= 4) {
++            SkASSERT(SkIsAlign4((uintptr_t) dst));
++            while (!SkIsAlign16((uintptr_t) dst)) {
++                *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
++                mask++; dst++; width--;
++            }
++
++            // Replicate source across 4 lanes, then unpack low half to interleaved 16-bit.
++            uint32_t srcPM = SkPackARGB32(0xFF, srcR, srcG, srcB);
++            __vector unsigned int src_v32 = vec_splats(srcPM);
++            const __vector unsigned char zeros = vec_splats((unsigned char)0);
++            __vector unsigned char src_v = vec_mergeh((__vector unsigned char)src_v32, zeros);
++            __vector unsigned char srcA_v = (__vector unsigned char)vec_splats((unsigned short)srcA);
++
++            while (width >= 4) {
++                __vector unsigned char dst_v = vec_xl(0, (const unsigned char*)dst);
++                // Load 8 bytes (4x uint16 mask) into low half of vector.
++                uint64_t mlo;
++                memcpy(&mlo, mask, sizeof(mlo));
++                __vector unsigned long long mask_low =
++                    (__vector unsigned long long){mlo, 0};
++                __vector unsigned char mask_v = (__vector unsigned char)mask_low;
++
++                // Check if all mask values are zero (skip blending if so).
++                if (!vec_all_eq((__vector unsigned long long)mask_v,
++                                vec_splats((unsigned long long)0))) {
++                    // Unpack low 8 bytes of mask (4x uint16) into 4x uint32 (with zeros).
++                    // Zero-extend the 4 uint16 masks to 4 uint32 (16-bit-granularity
++                    // merge, matching SSE2's _mm_unpacklo_epi16); a char-granularity
++                    // merge would byte-stretch the RGB565 value and misplace the shifts.
++                    mask_v = (__vector unsigned char)vec_mergeh((__vector unsigned short)mask_v,
++                                                               (__vector unsigned short)zeros);
++                    __vector unsigned char result =
++                        blend_lcd16_vsx(src_v, dst_v, mask_v, srcA_v);
++                    vec_xst(result, 0, (unsigned char*)dst);
++                }
++                dst += 4; mask += 4; width -= 4;
++            }
++        }
++
++        while (width > 0) {
++            *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
++            mask++; dst++; width--;
++        }
++    }
++
++    void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
++                               SkColor src, int width, SkPMColor opaqueDst) {
++        if (width <= 0) {
++            return;
++        }
++        int srcR = SkColorGetR(src);
++        int srcG = SkColorGetG(src);
++        int srcB = SkColorGetB(src);
++
++        if (width >= 4) {
++            SkASSERT(SkIsAlign4((uintptr_t) dst));
++            while (!SkIsAlign16((uintptr_t) dst)) {
++                *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
++                mask++; dst++; width--;
++            }
++
++            uint32_t srcPM = SkPackARGB32(0xFF, srcR, srcG, srcB);
++            __vector unsigned int src_v32 = vec_splats(srcPM);
++            const __vector unsigned char zeros = vec_splats((unsigned char)0);
++            __vector unsigned char src_v = vec_mergeh((__vector unsigned char)src_v32, zeros);
++
++            while (width >= 4) {
++                __vector unsigned char dst_v = vec_xl(0, (const unsigned char*)dst);
++                uint64_t mlo;
++                memcpy(&mlo, mask, sizeof(mlo));
++                __vector unsigned long long mask_low =
++                    (__vector unsigned long long){mlo, 0};
++                __vector unsigned char mask_v = (__vector unsigned char)mask_low;
++
++                if (!vec_all_eq((__vector unsigned long long)mask_v,
++                                vec_splats((unsigned long long)0))) {
++                    // Zero-extend the 4 uint16 masks to 4 uint32 (16-bit-granularity
++                    // merge, matching SSE2's _mm_unpacklo_epi16); a char-granularity
++                    // merge would byte-stretch the RGB565 value and misplace the shifts.
++                    mask_v = (__vector unsigned char)vec_mergeh((__vector unsigned short)mask_v,
++                                                               (__vector unsigned short)zeros);
++                    __vector unsigned char result =
++                        blend_lcd16_opaque_vsx(src_v, dst_v, mask_v);
++                    vec_xst(result, 0, (unsigned char*)dst);
++                }
++                dst += 4; mask += 4; width -= 4;
++            }
++        }
++
++        while (width > 0) {
++            *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
++            mask++; dst++; width--;
++        }
++    }
++
+ #elif defined(SK_ARM_HAS_NEON)
+     #include <arm_neon.h>
+ 
+diff --git a/gfx/skia/skia/src/opts/SkBitmapProcState_opts.h b/gfx/skia/skia/src/opts/SkBitmapProcState_opts.h
+index 6d01a2f4458f..87b160ed7a1e 100644
+--- a/gfx/skia/skia/src/opts/SkBitmapProcState_opts.h
++++ b/gfx/skia/skia/src/opts/SkBitmapProcState_opts.h
+@@ -29,6 +29,8 @@
+     #include <lasxintrin.h>
+ #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
+     #include <lsxintrin.h>
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++    #include <altivec.h>
+ #endif
+ 
+ namespace SK_OPTS_NS {
+@@ -260,6 +262,168 @@ static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, O
+         }
+     }
+ 
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++
++    // Helper: scalar uint32_t -> 16-byte vector with x in low 32 bits, zero elsewhere.
++    // Equivalent of x86's _mm_cvtsi32_si128.
++    static inline __vector unsigned char vsx_cvt_u32_to_vec(uint32_t x) {
++        __vector unsigned int v = (__vector unsigned int){x, 0, 0, 0};
++        return (__vector unsigned char)v;
++    }
++
++    // Helper: PPC64 VSX equivalent of x86's _mm_maddubs_epi16. Multiplies pairs of
++    // (unsigned byte, signed byte) and adds adjacent pairs to produce 16-bit signed
++    // values, saturating to int16. Implementation transcribes the GCC ppc_wrappers
++    // tmmintrin.h sequence for endianness correctness on LE PPC64.
++    static inline __vector signed short vsx_maddubs_epi16(__vector unsigned char A,
++                                                            __vector signed char B) {
++        __vector signed short __ff = vec_splats((signed short)0x00FF);
++        __vector signed short __C = vec_and(vec_unpackh((__vector signed char)A), __ff);
++        __vector signed short __D = vec_and(vec_unpackl((__vector signed char)A), __ff);
++        __vector signed short __E = vec_unpackh(B);
++        __vector signed short __F = vec_unpackl(B);
++        __C = vec_mul(__C, __E);
++        __D = vec_mul(__D, __F);
++        const __vector unsigned char __odds  = (__vector unsigned char){
++            0,1, 4,5, 8,9, 12,13,  16,17, 20,21, 24,25, 28,29
++        };
++        const __vector unsigned char __evens = (__vector unsigned char){
++            2,3, 6,7, 10,11, 14,15,  18,19, 22,23, 26,27, 30,31
++        };
++        __E = (__vector signed short)vec_perm((__vector unsigned char)__C,
++                                              (__vector unsigned char)__D, __odds);
++        __F = (__vector signed short)vec_perm((__vector unsigned char)__C,
++                                              (__vector unsigned char)__D, __evens);
++        return vec_adds(__E, __F);
++    }
++
++    /*not static*/ inline
++    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
++                                 const uint32_t* xy, int count, uint32_t* colors) {
++        SkASSERT(count > 0 && colors != nullptr);
++        SkASSERT(s.fBilerp);
++        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
++        SkASSERT(s.fAlphaScale <= 256);
++
++        // interpolate_in_x() is the crux of the implementation, interpolating in X
++        // for up to two output pixels (A and B) using vsx_maddubs_epi16().
++        auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
++                                   uint32_t B0, uint32_t B1,
++                                   __vector signed char interlaced_x_weights) {
++            // _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1))
++            // = vec_mergeh on uchar, since the input vectors have only the low 32 bits set.
++            __vector unsigned char interlaced_A = vec_mergeh(vsx_cvt_u32_to_vec(A0),
++                                                              vsx_cvt_u32_to_vec(A1));
++            __vector unsigned char interlaced_B = vec_mergeh(vsx_cvt_u32_to_vec(B0),
++                                                              vsx_cvt_u32_to_vec(B1));
++            // _mm_unpacklo_epi64 = vec_mergeh on long long.
++            __vector long long lo64 = vec_mergeh((__vector long long)interlaced_A,
++                                                 (__vector long long)interlaced_B);
++            return vsx_maddubs_epi16((__vector unsigned char)lo64, interlaced_x_weights);
++        };
++
++        // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
++        // Returns two pixels, with each color channel in a 16-bit lane of the result.
++        auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
++                                          uint32_t A2, uint32_t A3,
++                                          uint32_t B0, uint32_t B1,
++                                          uint32_t B2, uint32_t B3,
++                                          __vector signed char interlaced_x_weights,
++                                          int wy) {
++            __vector signed short top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights);
++            __vector signed short bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
++
++            // 16*top + (bot-top)*wy, mirroring the SSE2 form (saves one multiply vs. the
++            // straightforward top*(16-wy) + bot*wy).
++            __vector unsigned short v4 = vec_splats((unsigned short)4);
++            __vector signed short wy_v = vec_splats((signed short)wy);
++            __vector signed short px = vec_add(vec_sl(top, v4), vec_mul(vec_sub(bot, top), wy_v));
++
++            // Scale down by total max weight 16x16 = 256.
++            px = (__vector signed short)vec_sr((__vector unsigned short)px, vec_splats((unsigned short)8));
++
++            // Scale by alpha if needed.
++            if (s.fAlphaScale < 256) {
++                __vector signed short scale_v = vec_splats((signed short)s.fAlphaScale);
++                px = (__vector signed short)vec_sr((__vector unsigned short)vec_mul(px, scale_v),
++                                                   vec_splats((unsigned short)8));
++            }
++            return px;
++        };
++
++        // We're in _DX mode here, so we're only varying in X.
++        // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
++        int y0, y1, wy;
++        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
++
++        auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
++             row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
++
++        while (count >= 4) {
++            // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.
++            int x0[4],
++                x1[4];
++            __vector unsigned int wx;
++
++            // decode_packed_coordinates_and_weight(), 4x.
++            __vector unsigned int packed = (__vector unsigned int)vec_xl(0, (const unsigned char*)xy);
++            __vector unsigned int x0_v = vec_sr(packed, vec_splats(18u));
++            __vector unsigned int x1_v = vec_and(packed, vec_splats(0x3fffu));
++            vec_xst((__vector unsigned char)x0_v, 0, (unsigned char*)x0);
++            vec_xst((__vector unsigned char)x1_v, 0, (unsigned char*)x1);
++            wx = vec_and(vec_sr(packed, vec_splats(14u)), vec_splats(0xfu));  // [0,15]
++
++            // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
++            // and sixteen minus that as wl for pixels on the left at x0.
++            const __vector unsigned char wr_mask = (__vector unsigned char){
++                0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12
++            };
++            __vector unsigned char wr = vec_perm((__vector unsigned char)wx,
++                                                  (__vector unsigned char)wx, wr_mask);
++            __vector unsigned char wl = vec_sub(vec_splats((unsigned char)16), wr);
++
++            // Interlace wl and wr for vsx_maddubs_epi16().
++            __vector signed char interlaced_x_weights_AB = (__vector signed char)vec_mergeh(wl, wr);
++            __vector signed char interlaced_x_weights_CD = (__vector signed char)vec_mergel(wl, wr);
++
++            enum { A,B,C,D };
++
++            __vector signed short AB = interpolate_in_x_and_y(
++                    row0[x0[A]], row0[x1[A]], row1[x0[A]], row1[x1[A]],
++                    row0[x0[B]], row0[x1[B]], row1[x0[B]], row1[x1[B]],
++                    interlaced_x_weights_AB, wy);
++            __vector signed short CD = interpolate_in_x_and_y(
++                    row0[x0[C]], row0[x1[C]], row1[x0[C]], row1[x1[C]],
++                    row0[x0[D]], row0[x1[D]], row1[x0[D]], row1[x1[D]],
++                    interlaced_x_weights_CD, wy);
++
++            // Pack 16-bit signed -> 8-bit unsigned with saturation, write 4 pixels.
++            __vector unsigned char packed_out = vec_packsu(AB, CD);
++            vec_xst(packed_out, 0, (unsigned char*)colors);
++            xy     += 4;
++            colors += 4;
++            count  -= 4;
++        }
++
++        while (count --> 0) {
++            // Same flow as the count >= 4 loop, but writing one pixel.
++            int x0, x1, wx;
++            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
++
++            __vector unsigned char wr = vec_splats((unsigned char)wx);
++            __vector unsigned char wl = vec_sub(vec_splats((unsigned char)16), wr);
++            __vector signed char interlaced_x_weights = (__vector signed char)vec_mergeh(wl, wr);
++
++            __vector signed short Av = interpolate_in_x_and_y(
++                    row0[x0], row0[x1], row1[x0], row1[x1],
++                    0, 0, 0, 0,
++                    interlaced_x_weights, wy);
++            __vector unsigned char packed_out = vec_packsu(Av,
++                    (__vector signed short)(__vector unsigned char){0});
++            *colors++ = ((__vector unsigned int)packed_out)[0];
++        }
++    }
++
+ #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
+     /*not static*/ inline
+     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
+diff --git a/gfx/skia/skia/src/opts/SkBlitRow_opts.h b/gfx/skia/skia/src/opts/SkBlitRow_opts.h
+index d1de5681a72e..d03908a03a32 100644
+--- a/gfx/skia/skia/src/opts/SkBlitRow_opts.h
++++ b/gfx/skia/skia/src/opts/SkBlitRow_opts.h
+@@ -68,6 +68,43 @@
+     }
+ #endif
+ 
++#if defined(SK_CPU_PPC) && defined(__VSX__)
++    #include <altivec.h>
++
++    // Native VSX/AltiVec port of SkPMSrcOver_SSE2.
++    // Same algorithm: src + dst*(256-srcAlpha)/256.
++    static inline __vector unsigned char SkPMSrcOver_VSX(__vector unsigned char src,
++                                                         __vector unsigned char dst) {
++        __vector unsigned int src_u32 = (__vector unsigned int)src;
++        __vector unsigned int dst_u32 = (__vector unsigned int)dst;
++
++        // scale = 256 - (src >> 24)  (per 32-bit lane)
++        __vector unsigned int scale = vec_sub(vec_splats((unsigned int)256),
++                                              vec_sr(src_u32, vec_splats(24u)));
++        // scale_x2 = (scale << 16) | scale  -- splat the scale into both 16-bit halves
++        __vector unsigned int scale_x2 = vec_or(vec_sl(scale, vec_splats(16u)), scale);
++
++        const __vector unsigned int rb_mask = vec_splats(0x00FF00FFu);
++
++        // rb = (dst & 0x00FF00FF) * scale_x2 >> 8   (R and B channels in 16-bit lanes)
++        __vector unsigned short rb = (__vector unsigned short)vec_and(rb_mask, dst_u32);
++        rb = vec_mul(rb, (__vector unsigned short)scale_x2);
++        rb = vec_sr(rb, vec_splats((unsigned short)8));
++
++        // ga = (dst >> 8) * scale_x2  then mask out the rb channels
++        __vector unsigned short ga = vec_sr((__vector unsigned short)dst_u32,
++                                            vec_splats((unsigned short)8));
++        ga = vec_mul(ga, (__vector unsigned short)scale_x2);
++        // andc(ga, rb_mask) = ga & ~rb_mask  -- keep only G and A channels in 16-bit lanes
++        __vector unsigned int ga_u32 = vec_andc((__vector unsigned int)ga, rb_mask);
++
++        // result = src + adds_epu8(rb | ga)
++        __vector unsigned char merged =
++            (__vector unsigned char)vec_or((__vector unsigned int)rb, ga_u32);
++        return vec_adds(src, merged);
++    }
++#endif
++
+ #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+     #include <immintrin.h>
+ 
+@@ -176,6 +213,17 @@ inline void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len,
+     }
+ #endif
+ 
++#if defined(SK_CPU_PPC) && defined(__VSX__)
++    while (len >= 4) {
++        __vector unsigned char vsrc = vec_xl(0, (const unsigned char*)src);
++        __vector unsigned char vdst = vec_xl(0, (const unsigned char*)dst);
++        vec_xst(SkPMSrcOver_VSX(vsrc, vdst), 0, (unsigned char*)dst);
++        src += 4;
++        dst += 4;
++        len -= 4;
++    }
++#endif
++
+ #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+     while (len >= 4) {
+         _mm_storeu_si128((__m128i*)dst, SkPMSrcOver_SSE2(_mm_loadu_si128((const __m128i*)src),
+diff --git a/gfx/skia/skia/src/opts/SkRasterPipeline_opts.h b/gfx/skia/skia/src/opts/SkRasterPipeline_opts.h
+index 695b71434f8c..e2af0b94f392 100644
+--- a/gfx/skia/skia/src/opts/SkRasterPipeline_opts.h
++++ b/gfx/skia/skia/src/opts/SkRasterPipeline_opts.h
+@@ -87,6 +87,8 @@ using NoCtx = const void*;
+     #define SKRP_CPU_SCALAR
+ #elif defined(SK_ARM_HAS_NEON)
+     #define SKRP_CPU_NEON
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++    #define SKRP_CPU_VSX
+ #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
+     #define SKRP_CPU_SKX
+ #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+@@ -109,6 +111,8 @@ using NoCtx = const void*;
+     #include <math.h>
+ #elif defined(SKRP_CPU_NEON)
+     #include <arm_neon.h>
++#elif defined(SKRP_CPU_VSX)
++    #include <altivec.h>
+ #elif defined(SKRP_CPU_LASX)
+     #include <lasxintrin.h>
+     #include <lsxintrin.h>
+@@ -337,6 +341,239 @@ namespace SK_OPTS_NS {
+         vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
+     }
+ 
++#elif defined(SKRP_CPU_VSX)
++    // Reuse the file-scope Vec<N,T> defined above. It already handles the
++    // GCC-vs-Clang divergence (ext_vector_type on Clang; vector_size via
++    // VecHelper on GCC) and produces the right vector-register-passing ABI
++    // on PPC64. The vec_* intrinsics in <altivec.h> accept either form.
++    template <typename T> using V = Vec<4, T>;
++    using F   = V<float   >;
++    using I32 = V< int32_t>;
++    using U64 = V<uint64_t>;
++    using U32 = V<uint32_t>;
++    using U16 = V<uint16_t>;
++    using U8  = V<uint8_t >;
++
++    // We polyfill a few routines that Clang doesn't build into ext_vector_types.
++    SI F   min(F a, F b)     { return vec_min(a,b); }
++    SI I32 min(I32 a, I32 b) { return vec_min(a,b); }
++    SI U32 min(U32 a, U32 b) { return vec_min(a,b); }
++    SI F   max(F a, F b)     { return vec_max(a,b); }
++    SI I32 max(I32 a, I32 b) { return vec_max(a,b); }
++    SI U32 max(U32 a, U32 b) { return vec_max(a,b); }
++
++    SI F   abs_  (F v)   { return vec_abs(v); }
++    SI I32 abs_  (I32 v) { return vec_abs(v); }
++    SI F   rcp_approx(F v) { return vec_re(v); }
++    SI F   rcp_precise (F v) { F e = rcp_approx(v); return e * (2.0f - v * e); }
++    SI F   rsqrt_approx (F v)   { return vec_rsqrte(v); }
++
++    SI U16 pack(U32 v)       { return __builtin_convertvector(v, U16); }
++    SI U8  pack(U16 v)       { return __builtin_convertvector(v,  U8); }
++
++    SI F if_then_else(I32 c, F t, F e) {
++        return vec_or((__vector float)vec_and((__vector float)c, (__vector float)t), (__vector float)vec_andc((__vector float)e, (__vector float)c));
++    }
++    SI I32 if_then_else(I32 c, I32 t, I32 e) {
++        return (I32)vec_or((__vector unsigned int)vec_and((__vector unsigned int)c, (__vector unsigned int)t), (__vector unsigned int)vec_andc((__vector unsigned int)e, (__vector unsigned int)c));
++    }
++
++    // In both AltiVec and SSE there is no horizontal element compare, unlike ARM.  Fall back to scalar operations here...
++    SI bool any(I32 c) {
++        if (vec_extract((U32)c, 0) != 0) return 1;
++        if (vec_extract((U32)c, 1) != 0) return 1;
++        if (vec_extract((U32)c, 2) != 0) return 1;
++        if (vec_extract((U32)c, 3) != 0) return 1;
++        return 0;
++    }
++    SI bool all(I32 c) {
++        if (vec_extract((U32)c, 0) == 0) return 0;
++        if (vec_extract((U32)c, 1) == 0) return 0;
++        if (vec_extract((U32)c, 2) == 0) return 0;
++        if (vec_extract((U32)c, 3) == 0) return 0;
++        return 1;
++    }
++
++    SI F     mad(F f, F m, F a) { return vec_madd(f,m,a); }
++    SI F    nmad(F f, F m, F a) { return vec_nmsub(f,m,a); }
++    SI F  floor_(F v) { return vec_floor(v); }
++    SI F   ceil_(F v) { return vec_ceil(v); }
++    SI F   sqrt_(F v) { return vec_sqrt(v); }
++    SI I32 iround(F v) { return vec_cts((__vector float)vec_rint(v), 0); }
++    SI U32 round(F v)  { return vec_ctu((__vector float)vec_rint(v), 0); }
++    SI U32 round(F v, F scale) { return (U32)vec_cts((__vector float)vec_rint(v*scale), 0); }
++
++    template <typename T>
++    SI V<T> gather(const T* p, U32 ix) {
++        return V<T>{p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
++    }
++    template <typename T>
++    SI V<T> gather_unaligned(const T* ptr, U32 ix) {
++        // This tells the compiler ptr might not be aligned appropriately, so
++        // it generates better assembly.
++        typedef T __attribute__ ((aligned (1))) unaligned_ptr;
++        const unaligned_ptr* uptr = static_cast<const unaligned_ptr*>(ptr);
++        return V<T>{uptr[ix[0]], uptr[ix[1]], uptr[ix[2]], uptr[ix[3]]};
++    }
++    template <typename V, typename S>
++    SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
++        V before = gather(dst, ix);
++        V after = if_then_else(mask, src, before);
++        dst[ix[0]] = after[0];
++        dst[ix[1]] = after[1];
++        dst[ix[2]] = after[2];
++        dst[ix[3]] = after[3];
++    }
++
++    // Native VSX/AltiVec ports of the load2/store2/load3/load4/store4 helpers.
++    // Each uses vec_xl/vec_xst for unaligned 16-byte loads/stores, vec_mergeh/
++    // vec_mergel for SSE-style epi16/epi32/ps unpack ops, and vec_perm with a
++    // byte-mask for the SSE shufflelo/shufflehi/shuffle/srli_si128 ops. The
++    // PPC64 LE register-to-memory byte order matches x86 LE, so the byte-mask
++    // patterns are identical to the corresponding _mm_setr_epi8 forms.
++
++    SI void load2(const uint16_t* ptr, U16* r, U16* g) {
++        // Load 8 uint16: r0 g0 r1 g1 r2 g2 r3 g3 (in LE memory order).
++        __vector unsigned char v = vec_xl(0, (const unsigned char*)ptr);
++        // Extract every-other 16-bit value via vec_perm (high half of result is unused
++        // but written; sk_unaligned_load below picks up the low 8 bytes).
++        const __vector unsigned char r_mask = (__vector unsigned char){
++            0,1, 4,5, 8,9, 12,13,  0,0,0,0,0,0,0,0
++        };
++        const __vector unsigned char g_mask = (__vector unsigned char){
++            2,3, 6,7, 10,11, 14,15,  0,0,0,0,0,0,0,0
++        };
++        __vector unsigned char R_v = vec_perm(v, v, r_mask);
++        __vector unsigned char G_v = vec_perm(v, v, g_mask);
++        *r = sk_unaligned_load<U16>(&R_v);
++        *g = sk_unaligned_load<U16>(&G_v);
++    }
++
++    SI void store2(uint16_t* ptr, U16 r, U16 g) {
++        // Interleave: rg = r0 g0 r1 g1 r2 g2 r3 g3.
++        // r and g are 8-byte vectors; widen to 16 and vec_mergeh on ushort takes
++        // the low 4 lanes of each.
++        __vector unsigned short rw = widen_cast<__vector unsigned short>(r);
++        __vector unsigned short gw = widen_cast<__vector unsigned short>(g);
++        __vector unsigned short rg = vec_mergeh(rw, gw);
++        vec_xst((__vector unsigned char)rg, 0, (unsigned char*)ptr);
++    }
++
++    SI void load3(const uint16_t* ptr, U16* r, U16* g, U16* b) {
++        // 4 pixels x 3 channels x 2 bytes = 24 bytes. Two 16-byte loads with overlap
++        // avoid reading past the 24-byte source.
++        __vector unsigned char v01 = vec_xl(0, (const unsigned char*)(ptr + 0));
++        __vector unsigned char v23_raw = vec_xl(0, (const unsigned char*)(ptr + 4));
++        const __vector unsigned char zero = vec_splats((unsigned char)0);
++        // v23 = v23_raw >> 4 bytes (drops the overlapping pixel-1 trailing R).
++        const __vector unsigned char shift4 = (__vector unsigned char){
++            4,5,6,7, 8,9,10,11, 12,13,14,15, 16,16,16,16
++        };
++        __vector unsigned char v23 = vec_perm(v23_raw, zero, shift4);
++        // _N holds R,G,B for pixel N in its lower 3 lanes. shift6 advances to the next pixel.
++        const __vector unsigned char shift6 = (__vector unsigned char){
++            6,7,8,9, 10,11,12,13, 14,15, 16,16, 16,16, 16,16
++        };
++        __vector unsigned char _0 = v01;
++        __vector unsigned char _1 = vec_perm(v01, zero, shift6);
++        __vector unsigned char _2 = v23;
++        __vector unsigned char _3 = vec_perm(v23, zero, shift6);
++        // De-interlace to R,G,B per the SSE flow.
++        __vector unsigned short _02 = vec_mergeh((__vector unsigned short)_0,
++                                                  (__vector unsigned short)_2);
++        __vector unsigned short _13 = vec_mergeh((__vector unsigned short)_1,
++                                                  (__vector unsigned short)_3);
++        __vector unsigned short R_v = vec_mergeh(_02, _13);
++        const __vector unsigned char shift8 = (__vector unsigned char){
++            8,9,10,11, 12,13,14,15, 16,16,16,16, 16,16,16,16
++        };
++        __vector unsigned char G_v = vec_perm((__vector unsigned char)R_v, zero, shift8);
++        __vector unsigned short B_v = vec_mergel(_02, _13);
++        *r = sk_unaligned_load<U16>(&R_v);
++        *g = sk_unaligned_load<U16>(&G_v);
++        *b = sk_unaligned_load<U16>(&B_v);
++    }
++
++    SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
++        __vector unsigned short v01 = (__vector unsigned short)
++            vec_xl(0, (const unsigned char*)ptr);            // r0 g0 b0 a0 r1 g1 b1 a1
++        __vector unsigned short v23 = (__vector unsigned short)
++            vec_xl(0, (const unsigned char*)(ptr + 8));      // r2 g2 b2 a2 r3 g3 b3 a3
++        __vector unsigned short _02 = vec_mergeh(v01, v23);  // r0 r2 g0 g2 b0 b2 a0 a2
++        __vector unsigned short _13 = vec_mergel(v01, v23);  // r1 r3 g1 g3 b1 b3 a1 a3
++        __vector unsigned short rg  = vec_mergeh(_02, _13);  // r0 r1 r2 r3 g0 g1 g2 g3
++        __vector unsigned short ba  = vec_mergel(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3
++        *r = sk_unaligned_load<U16>((const uint16_t*)&rg + 0);
++        *g = sk_unaligned_load<U16>((const uint16_t*)&rg + 4);
++        *b = sk_unaligned_load<U16>((const uint16_t*)&ba + 0);
++        *a = sk_unaligned_load<U16>((const uint16_t*)&ba + 4);
++    }
++
++    SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
++        __vector unsigned short rw = widen_cast<__vector unsigned short>(r);
++        __vector unsigned short gw = widen_cast<__vector unsigned short>(g);
++        __vector unsigned short bw = widen_cast<__vector unsigned short>(b);
++        __vector unsigned short aw = widen_cast<__vector unsigned short>(a);
++        __vector unsigned short rg = vec_mergeh(rw, gw);  // r0 g0 r1 g1 r2 g2 r3 g3
++        __vector unsigned short ba = vec_mergeh(bw, aw);  // b0 a0 b1 a1 b2 a2 b3 a3
++        // Now interleave 32-bit lanes (each rg pair = 1 lane, each ba pair = 1 lane).
++        __vector unsigned int rgba_lo = vec_mergeh((__vector unsigned int)rg,
++                                                    (__vector unsigned int)ba);
++        __vector unsigned int rgba_hi = vec_mergel((__vector unsigned int)rg,
++                                                    (__vector unsigned int)ba);
++        vec_xst((__vector unsigned char)rgba_lo, 0, (unsigned char*)ptr);
++        vec_xst((__vector unsigned char)rgba_hi, 0, (unsigned char*)(ptr + 8));
++    }
++
++    SI void load2(const float* ptr, F* r, F* g) {
++        __vector float _01 = vec_xl(0, ptr);          // r0 g0 r1 g1
++        __vector float _23 = vec_xl(0, ptr + 4);      // r2 g2 r3 g3
++        // r = lanes {_01[0], _01[2], _23[0], _23[2]}; g = {_01[1], _01[3], _23[1], _23[3]}.
++        const __vector unsigned char r_mask = (__vector unsigned char){
++            0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27
++        };
++        const __vector unsigned char g_mask = (__vector unsigned char){
++            4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31
++        };
++        *r = (F)vec_perm((__vector unsigned char)_01, (__vector unsigned char)_23, r_mask);
++        *g = (F)vec_perm((__vector unsigned char)_01, (__vector unsigned char)_23, g_mask);
++    }
++
++    SI void store2(float* ptr, F r, F g) {
++        __vector float _01 = vec_mergeh((__vector float)r, (__vector float)g);   // r0 g0 r1 g1
++        __vector float _23 = vec_mergel((__vector float)r, (__vector float)g);   // r2 g2 r3 g3
++        vec_xst((__vector unsigned char)_01, 0, (unsigned char*)ptr);
++        vec_xst((__vector unsigned char)_23, 0, (unsigned char*)(ptr + 4));
++    }
++
++    SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
++        // 4x4 float matrix transpose: rows -> columns.
++        __vector float row0 = vec_xl(0, ptr +  0);
++        __vector float row1 = vec_xl(0, ptr +  4);
++        __vector float row2 = vec_xl(0, ptr +  8);
++        __vector float row3 = vec_xl(0, ptr + 12);
++        __vector float T0 = vec_mergeh(row0, row2);  // {row0[0], row2[0], row0[1], row2[1]}
++        __vector float T1 = vec_mergeh(row1, row3);
++        __vector float T2 = vec_mergel(row0, row2);
++        __vector float T3 = vec_mergel(row1, row3);
++        *r = (F)vec_mergeh(T0, T1);  // {row0[0], row1[0], row2[0], row3[0]}
++        *g = (F)vec_mergel(T0, T1);
++        *b = (F)vec_mergeh(T2, T3);
++        *a = (F)vec_mergel(T2, T3);
++    }
++
++    SI void store4(float* ptr, F r, F g, F b, F a) {
++        // 4x4 float matrix transpose, then store rows.
++        __vector float T0 = vec_mergeh((__vector float)r, (__vector float)b);
++        __vector float T1 = vec_mergeh((__vector float)g, (__vector float)a);
++        __vector float T2 = vec_mergel((__vector float)r, (__vector float)b);
++        __vector float T3 = vec_mergel((__vector float)g, (__vector float)a);
++        vec_xst((__vector unsigned char)vec_mergeh(T0, T1), 0, (unsigned char*)(ptr +  0));
++        vec_xst((__vector unsigned char)vec_mergel(T0, T1), 0, (unsigned char*)(ptr +  4));
++        vec_xst((__vector unsigned char)vec_mergeh(T2, T3), 0, (unsigned char*)(ptr +  8));
++        vec_xst((__vector unsigned char)vec_mergel(T2, T3), 0, (unsigned char*)(ptr + 12));
++    }
++
+ #elif defined(SKRP_CPU_SKX)
+     template <typename T> using V = Vec<16, T>;
+     using F   = V<float   >;
+diff --git a/gfx/skia/skia/src/opts/SkSwizzler_opts.inc b/gfx/skia/skia/src/opts/SkSwizzler_opts.inc
+index 671db3f05f61..c578238a9e58 100644
+--- a/gfx/skia/skia/src/opts/SkSwizzler_opts.inc
++++ b/gfx/skia/skia/src/opts/SkSwizzler_opts.inc
+@@ -84,6 +84,29 @@ SI float reciprocal_alpha(float a) {
+     auto q = F4{1.0f} / vA;
+     return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
+ }
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++// -- VSX -- Harden against timing attacks.
++// vec_splats / vec_div / vec_cmpgt / vec_and each map to a single VSX op on
++// both GCC and Clang. vec_cmpgt(vA, 0) is exact for the non-negative-alpha
++// contract (0 <= a) and avoids Clang's static_cast<float>(vector) extension
++// that GCC does not support.
++SK_NO_SANITIZE("float-divide-by-zero")
++SI float reciprocal_alpha_times_255(float a) {
++    SkASSERT(0 <= a && a <= 255);
++    __vector float vA = vec_splats(a);
++    __vector float q = vec_div(vec_splats(255.0f), vA);
++    __vector float vMask = (__vector float)vec_cmpgt(vA, vec_splats(0.0f));
++    return vec_and(vMask, q)[0];
++}
++
++SK_NO_SANITIZE("float-divide-by-zero")
++SI float reciprocal_alpha(float a) {
++    SkASSERT(0 <= a && a <= 1);
++    __vector float vA = vec_splats(a);
++    __vector float q = vec_div(vec_splats(1.0f), vA);
++    __vector float vMask = (__vector float)vec_cmpgt(vA, vec_splats(0.0f));
++    return vec_and(vMask, q)[0];
++}
+ #else
+ // -- Portable -- *Not* hardened against timing attacks
+ SI float reciprocal_alpha_times_255(float a) {
+@@ -1085,6 +1108,208 @@ void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
+     rgbA_to_BGRA_portable(dst, src, count);
+ }
+ 
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++// -- VSX -- Native Power VSX/AltiVec ports of the SSSE3 swizzlers below.
++// Each _mm_* operation is replaced by the corresponding vec_* sequence per
++// the GCC ppc_wrappers translation pattern (vec_mergeh/l, vec_perm, and the
++// vec_vmuleuh/vmulouh + permute idiom for _mm_mulhi_epu16). The permute
++// masks for byte-shuffles use the same byte-order layout as the SSE
++// _mm_setr_epi8 forms because PPC64 LE register-to-memory byte order is the
++// same as x86 LE.
++
++// Scale: ((x*y) + 128) * 257 >> 16, per 16-bit lane (matches the SSSE3 form).
++static inline __vector unsigned short scale(__vector unsigned short x, __vector unsigned short y) {
++    const __vector unsigned short v128 = vec_splats((unsigned short)128);
++    const __vector unsigned short v257 = vec_splats((unsigned short)257);
++    __vector unsigned short summ = (__vector unsigned short)((__vector unsigned short)(x * y) + v128);
++    // _mm_mulhi_epu16 equivalent: 16x16 -> high 16 bits, via mule+mulo+permute.
++    __vector unsigned int even = vec_vmuleuh(summ, v257);
++    __vector unsigned int odd  = vec_vmulouh(summ, v257);
++    const __vector unsigned char xform = (__vector unsigned char){
++        0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
++        0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
++    };
++    return (__vector unsigned short)vec_perm((__vector unsigned char)even,
++                                             (__vector unsigned char)odd, xform);
++}
++
++static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
++    auto premul8 = [=](__vector unsigned char* lo, __vector unsigned char* hi) {
++        const __vector unsigned char zeros = (__vector unsigned char){0};
++        const __vector unsigned char planar = kSwapRB
++            ? (__vector unsigned char){2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15}
++            : (__vector unsigned char){0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15};
++
++        // Swizzle each 16-byte chunk into 8-bit planar layout.
++        *lo = vec_perm(*lo, *lo, planar);                        // rrrrgggg bbbbaaaa
++        *hi = vec_perm(*hi, *hi, planar);                        // RRRRGGGG BBBBAAAA
++
++        // Interleave the two halves at 32-bit granularity.
++        __vector unsigned char rg = (__vector unsigned char)
++            vec_mergeh((__vector unsigned int)*lo, (__vector unsigned int)*hi);  // rrrrRRRR ggggGGGG
++        __vector unsigned char ba = (__vector unsigned char)
++            vec_mergel((__vector unsigned int)*lo, (__vector unsigned int)*hi);  // bbbbBBBB aaaaAAAA
++
++        // Unpack to 16-bit planar.
++        __vector unsigned short r = (__vector unsigned short)vec_mergeh(rg, zeros);
++        __vector unsigned short g = (__vector unsigned short)vec_mergel(rg, zeros);
++        __vector unsigned short b = (__vector unsigned short)vec_mergeh(ba, zeros);
++        __vector unsigned short a = (__vector unsigned short)vec_mergel(ba, zeros);
++
++        // Premultiply each colour channel by alpha.
++        r = scale(r, a);
++        g = scale(g, a);
++        b = scale(b, a);
++
++        // Repack into interlaced pixels.
++        const __vector unsigned short v8 = vec_splats((unsigned short)8);
++        __vector unsigned short rg2 = vec_or(r, vec_sl(g, v8));
++        __vector unsigned short ba2 = vec_or(b, vec_sl(a, v8));
++        *lo = (__vector unsigned char)vec_mergeh(rg2, ba2);
++        *hi = (__vector unsigned char)vec_mergel(rg2, ba2);
++    };
++
++    while (count >= 8) {
++        __vector unsigned char lo = vec_xl(0, (const unsigned char*)(src + 0));
++        __vector unsigned char hi = vec_xl(0, (const unsigned char*)(src + 4));
++        premul8(&lo, &hi);
++        vec_xst(lo, 0, (unsigned char*)(dst + 0));
++        vec_xst(hi, 0, (unsigned char*)(dst + 4));
++        src += 8; dst += 8; count -= 8;
++    }
++
++    if (count >= 4) {
++        __vector unsigned char lo = vec_xl(0, (const unsigned char*)src);
++        __vector unsigned char hi = (__vector unsigned char){0};
++        premul8(&lo, &hi);
++        vec_xst(lo, 0, (unsigned char*)dst);
++        src += 4; dst += 4; count -= 4;
++    }
++
++    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
++    proc(dst, src, count);
++}
++
++void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
++    premul_should_swapRB(false, dst, src, count);
++}
++
++void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
++    premul_should_swapRB(true, dst, src, count);
++}
++
++void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
++    const __vector unsigned char swapRB = (__vector unsigned char){
++        2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15
++    };
++    while (count >= 4) {
++        __vector unsigned char rgba = vec_xl(0, (const unsigned char*)src);
++        __vector unsigned char bgra = vec_perm(rgba, rgba, swapRB);
++        vec_xst(bgra, 0, (unsigned char*)dst);
++        src += 4; dst += 4; count -= 4;
++    }
++    RGBA_to_BGRA_portable(dst, src, count);
++}
++
++void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
++    while (count >= 8) {
++        __vector unsigned short ga = (__vector unsigned short)vec_xl(0, src);
++        __vector unsigned short gg = vec_or(
++                vec_and(ga, vec_splats((unsigned short)0x00FF)),
++                vec_sl (ga, vec_splats((unsigned short)8)));
++        __vector unsigned short ggga_lo = vec_mergeh(gg, ga);
++        __vector unsigned short ggga_hi = vec_mergel(gg, ga);
++        vec_xst((__vector unsigned char)ggga_lo, 0, (unsigned char*)(dst + 0));
++        vec_xst((__vector unsigned char)ggga_hi, 0, (unsigned char*)(dst + 4));
++        src += 8 * 2; dst += 8; count -= 8;
++    }
++    grayA_to_RGBA_portable(dst, src, count);
++}
++
++void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
++    while (count >= 8) {
++        __vector unsigned short grayA = (__vector unsigned short)vec_xl(0, src);
++        __vector unsigned short g0 = vec_and(grayA, vec_splats((unsigned short)0x00FF));
++        __vector unsigned short a0 = vec_sr   (grayA, vec_splats((unsigned short)8));
++        g0 = scale(g0, a0);
++        const __vector unsigned short v8 = vec_splats((unsigned short)8);
++        __vector unsigned short gg = vec_or(g0, vec_sl(g0, v8));
++        __vector unsigned short ga = vec_or(g0, vec_sl(a0, v8));
++        __vector unsigned short ggga_lo = vec_mergeh(gg, ga);
++        __vector unsigned short ggga_hi = vec_mergel(gg, ga);
++        vec_xst((__vector unsigned char)ggga_lo, 0, (unsigned char*)(dst + 0));
++        vec_xst((__vector unsigned char)ggga_hi, 0, (unsigned char*)(dst + 4));
++        src += 8 * 2; dst += 8; count -= 8;
++    }
++    grayA_to_rgbA_portable(dst, src, count);
++}
++
++enum Format { kRGB1, kBGR1 };
++static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
++    auto convert8 = [=](__vector unsigned char* lo, __vector unsigned char* hi) {
++        const __vector unsigned char zeros = (__vector unsigned char){0};
++        const __vector unsigned char planar = (kBGR1 == format)
++            ? (__vector unsigned char){2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15}
++            : (__vector unsigned char){0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15};
++
++        *lo = vec_perm(*lo, *lo, planar);                        // ccccmmmm yyyykkkk
++        *hi = vec_perm(*hi, *hi, planar);                        // CCCCMMMM YYYYKKKK
++        __vector unsigned char cm = (__vector unsigned char)
++            vec_mergeh((__vector unsigned int)*lo, (__vector unsigned int)*hi);
++        __vector unsigned char yk = (__vector unsigned char)
++            vec_mergel((__vector unsigned int)*lo, (__vector unsigned int)*hi);
++
++        __vector unsigned short c = (__vector unsigned short)vec_mergeh(cm, zeros);
++        __vector unsigned short m = (__vector unsigned short)vec_mergel(cm, zeros);
++        __vector unsigned short y = (__vector unsigned short)vec_mergeh(yk, zeros);
++        __vector unsigned short k = (__vector unsigned short)vec_mergel(yk, zeros);
++
++        __vector unsigned short r = scale(c, k);
++        __vector unsigned short g = scale(m, k);
++        __vector unsigned short b = scale(y, k);
++
++        const __vector unsigned short v8 = vec_splats((unsigned short)8);
++        __vector unsigned short rg = vec_or(r, vec_sl(g, v8));
++        __vector unsigned short ba = vec_or(b, vec_splats((unsigned short)0xFF00));
++        *lo = (__vector unsigned char)vec_mergeh(rg, ba);
++        *hi = (__vector unsigned char)vec_mergel(rg, ba);
++    };
++
++    while (count >= 8) {
++        __vector unsigned char lo = vec_xl(0, (const unsigned char*)(src + 0));
++        __vector unsigned char hi = vec_xl(0, (const unsigned char*)(src + 4));
++        convert8(&lo, &hi);
++        vec_xst(lo, 0, (unsigned char*)(dst + 0));
++        vec_xst(hi, 0, (unsigned char*)(dst + 4));
++        src += 8; dst += 8; count -= 8;
++    }
++    if (count >= 4) {
++        __vector unsigned char lo = vec_xl(0, (const unsigned char*)src);
++        __vector unsigned char hi = (__vector unsigned char){0};
++        convert8(&lo, &hi);
++        vec_xst(lo, 0, (unsigned char*)dst);
++        src += 4; dst += 4; count -= 4;
++    }
++    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
++    proc(dst, src, count);
++}
++
++void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
++    inverted_cmyk_to(kRGB1, dst, src, count);
++}
++
++void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
++    inverted_cmyk_to(kBGR1, dst, src, count);
++}
++
++void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
++    rgbA_to_RGBA_portable(dst, src, count);
++}
++
++void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
++    rgbA_to_BGRA_portable(dst, src, count);
++}
++
+ #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
+ // -- LASX ----------------------------------------------------------------------------------------
+ 
+@@ -1736,6 +1961,39 @@ static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count)
+         }
+         gray_to_RGB1_portable(dst, src, count);
+     }
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
++        const __vector unsigned char alphas = vec_splats((unsigned char)0xFF);
++        while (count >= 16) {
++            __vector unsigned char grays = vec_xl(0, src);
++
++            // Replicate gray byte: gg = unpack(gray, gray) per 8-bit lane.
++            __vector unsigned char gg_lo = vec_mergeh(grays, grays);
++            __vector unsigned char gg_hi = vec_mergel(grays, grays);
++            __vector unsigned char ga_lo = vec_mergeh(grays, alphas);
++            __vector unsigned char ga_hi = vec_mergel(grays, alphas);
++
++            // Interleave g-pairs and ga-pairs at 16-bit granularity.
++            __vector unsigned short ggga0 = vec_mergeh((__vector unsigned short)gg_lo,
++                                                       (__vector unsigned short)ga_lo);
++            __vector unsigned short ggga1 = vec_mergel((__vector unsigned short)gg_lo,
++                                                       (__vector unsigned short)ga_lo);
++            __vector unsigned short ggga2 = vec_mergeh((__vector unsigned short)gg_hi,
++                                                       (__vector unsigned short)ga_hi);
++            __vector unsigned short ggga3 = vec_mergel((__vector unsigned short)gg_hi,
++                                                       (__vector unsigned short)ga_hi);
++
++            vec_xst((__vector unsigned char)ggga0, 0, (unsigned char*)(dst +  0));
++            vec_xst((__vector unsigned char)ggga1, 0, (unsigned char*)(dst +  4));
++            vec_xst((__vector unsigned char)ggga2, 0, (unsigned char*)(dst +  8));
++            vec_xst((__vector unsigned char)ggga3, 0, (unsigned char*)(dst + 12));
++
++            src += 16;
++            dst += 16;
++            count -= 16;
++        }
++        gray_to_RGB1_portable(dst, src, count);
++    }
+ #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
+     /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
+         const __m256i alphas = __lasx_xvreplgr2vr_b(0xFF);
+@@ -1920,6 +2178,37 @@ static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count)
+         proc(dst, src, count);
+     }
+ 
++    void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
++        insert_alpha_should_swaprb(false, dst, src, count);
++    }
++    void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
++        insert_alpha_should_swaprb(true, dst, src, count);
++    }
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++    static void insert_alpha_should_swaprb(bool kSwapRB,
++                                           uint32_t dst[], const uint8_t* src, int count) {
++        // alphaMask = 0xFF000000 per 32-bit lane -> bytes (in LE memory layout) are
++        // {00,00,00,FF, 00,00,00,FF, ...}.
++        const __vector unsigned char alphaMask = (__vector unsigned char){
++            0,0,0,0xFF, 0,0,0,0xFF, 0,0,0,0xFF, 0,0,0,0xFF
++        };
++        // 'X' (= 0) is irrelevant: the alphaMask OR overwrites those lanes with FF.
++        const __vector unsigned char expand = kSwapRB
++            ? (__vector unsigned char){2,1,0,0, 5,4,3,0, 8,7,6,0, 11,10,9,0}
++            : (__vector unsigned char){0,1,2,0, 3,4,5,0, 6,7,8,0,  9,10,11,0};
++
++        while (count >= 6) {
++            __vector unsigned char rgb = vec_xl(0, src);
++            __vector unsigned char rgba = vec_or(vec_perm(rgb, rgb, expand), alphaMask);
++            vec_xst(rgba, 0, (unsigned char*)dst);
++            src += 4*3;
++            dst += 4;
++            count -= 4;
++        }
++        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
++        proc(dst, src, count);
++    }
++
+     void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
+         insert_alpha_should_swaprb(false, dst, src, count);
+     }
+
+base-commit: a8d530ac13f0ce7e937c047f01f0d36764f5d34e
+-- 
+2.52.0
+

diff --git a/0002-Add-VSX-instructions-for-libwebp.patch b/0002-Add-VSX-instructions-for-libwebp.patch
new file mode 100644
index 0000000..1f857a7
--- /dev/null
+++ b/0002-Add-VSX-instructions-for-libwebp.patch
@@ -0,0 +1,2524 @@
+From b9e116898830a0f9edd1b0566651ce2d4989618d Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Trung=20L=C3=AA?= <8@tle.id.au>
+Date: Fri, 12 Jun 2026 15:30:13 +1000
+Subject: [PATCH 2/3] Add VSX instructions for libwebp
+
+Assisted-by: Lance Albertson <lance@osuosl.org>
+Assisted-by: Thushan Fernando <thushan@thushanfernando.com>
+---
+ media/libwebp/src/dsp/alpha_processing.c     |   6 +
+ media/libwebp/src/dsp/alpha_processing_vsx.c | 246 +++++++
+ media/libwebp/src/dsp/cpu.h                  |  14 +-
+ media/libwebp/src/dsp/dec.c                  |   6 +
+ media/libwebp/src/dsp/dec_vsx.c              | 737 +++++++++++++++++++
+ media/libwebp/src/dsp/filters.c              |   6 +
+ media/libwebp/src/dsp/filters_vsx.c          | 162 ++++
+ media/libwebp/src/dsp/lossless.c             |   6 +
+ media/libwebp/src/dsp/lossless_vsx.c         | 449 +++++++++++
+ media/libwebp/src/dsp/moz.build              |  14 +
+ media/libwebp/src/dsp/rescaler.c             |   6 +
+ media/libwebp/src/dsp/rescaler_vsx.c         | 201 +++++
+ media/libwebp/src/dsp/upsampling.c           |  12 +
+ media/libwebp/src/dsp/upsampling_vsx.c       | 151 ++++
+ media/libwebp/src/dsp/yuv.c                  |   6 +
+ media/libwebp/src/dsp/yuv.h                  |  21 +
+ media/libwebp/src/dsp/yuv_vsx.c              | 206 ++++++
+ media/libwebp/src/moz/cpu.cpp                |   4 +
+ 18 files changed, 2252 insertions(+), 1 deletion(-)
+ create mode 100644 media/libwebp/src/dsp/alpha_processing_vsx.c
+ create mode 100644 media/libwebp/src/dsp/dec_vsx.c
+ create mode 100644 media/libwebp/src/dsp/filters_vsx.c
+ create mode 100644 media/libwebp/src/dsp/lossless_vsx.c
+ create mode 100644 media/libwebp/src/dsp/rescaler_vsx.c
+ create mode 100644 media/libwebp/src/dsp/upsampling_vsx.c
+ create mode 100644 media/libwebp/src/dsp/yuv_vsx.c
+
+diff --git a/media/libwebp/src/dsp/alpha_processing.c b/media/libwebp/src/dsp/alpha_processing.c
+index 4927e73e81bf..5f9152bf701a 100644
+--- a/media/libwebp/src/dsp/alpha_processing.c
++++ b/media/libwebp/src/dsp/alpha_processing.c
+@@ -434,6 +434,7 @@ extern void WebPInitAlphaProcessingMIPSdspR2(void);
+ extern void WebPInitAlphaProcessingSSE2(void);
+ extern void WebPInitAlphaProcessingSSE41(void);
+ extern void WebPInitAlphaProcessingNEON(void);
++extern void WebPInitAlphaProcessingVSX(void);
+ 
+ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
+   WebPMultARGBRow = WebPMultARGBRow_C;
+@@ -472,6 +473,11 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
+     if (VP8GetCPUInfo(kMIPSdspR2)) {
+       WebPInitAlphaProcessingMIPSdspR2();
+     }
++#endif
++#if defined(WEBP_HAVE_VSX)
++    if (VP8GetCPUInfo(kVSX)) {
++      WebPInitAlphaProcessingVSX();
++    }
+ #endif
+   }
+ 
+diff --git a/media/libwebp/src/dsp/alpha_processing_vsx.c b/media/libwebp/src/dsp/alpha_processing_vsx.c
+new file mode 100644
+index 000000000000..2aad1cd8b648
+--- /dev/null
++++ b/media/libwebp/src/dsp/alpha_processing_vsx.c
+@@ -0,0 +1,246 @@
++// Copyright 2014 Google Inc. All Rights Reserved.
++//
++// Use of this source code is governed by a BSD-style license
++// that can be found in the COPYING file in the root of the source
++// tree. An additional intellectual property rights grant can be found
++// in the file PATENTS. All contributing project authors may
++// be found in the AUTHORS file in the root of the source tree.
++// -----------------------------------------------------------------------------
++//
++// VSX (PowerPC) version of alpha processing functions.
++
++#include "src/dsp/dsp.h"
++
++#if defined(WEBP_USE_VSX)
++
++#include <altivec.h>
++#include <string.h>
++
++#include "src/dsp/cpu.h"
++#include "src/webp/types.h"
++
++typedef __vector unsigned char u8x16;
++typedef __vector unsigned short u16x8;
++typedef __vector signed short i16x8;
++typedef __vector unsigned int u32x4;
++typedef __vector signed int i32x4;
++
++//------------------------------------------------------------------------------
++// Alpha dispatch / extraction.
++
++static int DispatchAlpha_VSX(const uint8_t* WEBP_RESTRICT alpha,
++                             int alpha_stride, int width, int height,
++                             uint8_t* WEBP_RESTRICT dst, int dst_stride) {
++  uint32_t alpha_and = 0xff;
++  int i, j, k;
++  const u8x16 zero = vec_splats((unsigned char)0);
++  const u16x8 z16 = vec_splats((unsigned short)0);
++  const u32x4 a_mask = vec_splats((uint32_t)0xff);  // selects the low byte
++  u8x16 all_and = vec_splats((unsigned char)0xff);
++  const int limit = width & ~15;
++
++  for (j = 0; j < height; ++j) {
++    uint8_t* ptr = dst;
++    for (i = 0; i < limit; i += 16) {
++      const u8x16 a0 = vec_xl(0, (unsigned char*)&alpha[i]);
++      // Spread the 16 alpha bytes to the low byte of 16 32-bit lanes.
++      const u16x8 a1_lo = (u16x8)vec_mergeh(a0, zero);
++      const u16x8 a1_hi = (u16x8)vec_mergel(a0, zero);
++      const u32x4 s0 = (u32x4)vec_mergeh(a1_lo, z16);
++      const u32x4 s1 = (u32x4)vec_mergel(a1_lo, z16);
++      const u32x4 s2 = (u32x4)vec_mergeh(a1_hi, z16);
++      const u32x4 s3 = (u32x4)vec_mergel(a1_hi, z16);
++      const u32x4* spread[4] = {&s0, &s1, &s2, &s3};
++      for (k = 0; k < 4; ++k) {
++        const u32x4 d = vec_xl(0, (uint32_t*)(ptr + 16 * k));
++        vec_xst(vec_sel(d, *spread[k], a_mask), 0, (uint32_t*)(ptr + 16 * k));
++      }
++      all_and = vec_and(all_and, a0);
++      ptr += 64;
++    }
++    for (; i < width; ++i) {
++      const uint32_t alpha_value = alpha[i];
++      dst[4 * i] = alpha_value;
++      alpha_and &= alpha_value;
++    }
++    alpha += alpha_stride;
++    dst += dst_stride;
++  }
++  {
++    unsigned char tmp[16];
++    memcpy(tmp, &all_and, 16);
++    for (k = 0; k < 16; ++k) alpha_and &= tmp[k];
++  }
++  return (alpha_and != 0xff);
++}
++
++static void DispatchAlphaToGreen_VSX(const uint8_t* WEBP_RESTRICT alpha,
++                                     int alpha_stride, int width, int height,
++                                     uint32_t* WEBP_RESTRICT dst,
++                                     int dst_stride) {
++  int i, j;
++  const u8x16 zero = vec_splats((unsigned char)0);
++  const u16x8 z16 = vec_splats((unsigned short)0);
++  const int limit = width & ~15;
++  for (j = 0; j < height; ++j) {
++    for (i = 0; i < limit; i += 16) {
++      const u8x16 a0 = vec_xl(0, (unsigned char*)&alpha[i]);
++      // Place each alpha byte into the green slot (<< 8) of a 32-bit lane.
++      const u16x8 a1_lo = (u16x8)vec_mergeh(zero, a0);  // note the 'zero' first
++      const u16x8 a1_hi = (u16x8)vec_mergel(zero, a0);
++      const u32x4 g0 = (u32x4)vec_mergeh(a1_lo, z16);
++      const u32x4 g1 = (u32x4)vec_mergel(a1_lo, z16);
++      const u32x4 g2 = (u32x4)vec_mergeh(a1_hi, z16);
++      const u32x4 g3 = (u32x4)vec_mergel(a1_hi, z16);
++      vec_xst(g0, 0, &dst[i + 0]);
++      vec_xst(g1, 0, &dst[i + 4]);
++      vec_xst(g2, 0, &dst[i + 8]);
++      vec_xst(g3, 0, &dst[i + 12]);
++    }
++    for (; i < width; ++i) dst[i] = alpha[i] << 8;
++    alpha += alpha_stride;
++    dst += dst_stride;
++  }
++}
++
++static int ExtractAlpha_VSX(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
++                            int width, int height, uint8_t* WEBP_RESTRICT alpha,
++                            int alpha_stride) {
++  uint32_t alpha_and = 0xff;
++  int i, j, k;
++  const u32x4 a_mask = vec_splats((uint32_t)0xff);  // keeps the low byte
++  u8x16 all_and = vec_splats((unsigned char)0xff);
++  const int limit = width & ~7;
++
++  for (j = 0; j < height; ++j) {
++    const uint32_t* src = (const uint32_t*)argb;
++    for (i = 0; i < limit; i += 8) {
++      const u32x4 a0 = vec_and(vec_xl(0, (uint32_t*)(src + 0)), a_mask);
++      const u32x4 a1 = vec_and(vec_xl(0, (uint32_t*)(src + 4)), a_mask);
++      const i16x8 c0 = vec_packs((i32x4)a0, (i32x4)a1);
++      const u8x16 d0 = vec_packsu(c0, c0);  // 8 alpha bytes in the low half
++      memcpy(&alpha[i], &d0, 8);
++      all_and = vec_and(all_and, d0);
++      src += 8;
++    }
++    for (; i < width; ++i) {
++      const uint32_t alpha_value = argb[4 * i];
++      alpha[i] = alpha_value;
++      alpha_and &= alpha_value;
++    }
++    argb += argb_stride;
++    alpha += alpha_stride;
++  }
++  {
++    unsigned char tmp[16];
++    memcpy(tmp, &all_and, 16);
++    for (k = 0; k < 8; ++k) alpha_and &= tmp[k];
++  }
++  return (alpha_and == 0xff);
++}
++
++static void ExtractGreen_VSX(const uint32_t* WEBP_RESTRICT argb,
++                             uint8_t* WEBP_RESTRICT alpha, int size) {
++  int i;
++  const u32x4 mask = vec_splats((uint32_t)0xff);
++  const u32x4 sh8 = vec_splats((uint32_t)8);
++  for (i = 0; i + 16 <= size; i += 16) {
++    const u32x4 a0 =
++        vec_and(vec_sr(vec_xl(0, (uint32_t*)(argb + i + 0)), sh8), mask);
++    const u32x4 a1 =
++        vec_and(vec_sr(vec_xl(0, (uint32_t*)(argb + i + 4)), sh8), mask);
++    const u32x4 a2 =
++        vec_and(vec_sr(vec_xl(0, (uint32_t*)(argb + i + 8)), sh8), mask);
++    const u32x4 a3 =
++        vec_and(vec_sr(vec_xl(0, (uint32_t*)(argb + i + 12)), sh8), mask);
++    const i16x8 d0 = vec_packs((i32x4)a0, (i32x4)a1);
++    const i16x8 d1 = vec_packs((i32x4)a2, (i32x4)a3);
++    const u8x16 e = vec_packsu(d0, d1);
++    vec_xst(e, 0, &alpha[i]);
++  }
++  for (; i < size; ++i) alpha[i] = argb[i] >> 8;
++}
++
++//------------------------------------------------------------------------------
++// Premultiply.
++
++#define MULTIPLIER(a) ((a) * 32897U)
++#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
++
++// Spreads the alpha lane across r/g/b and inserts 0xff in the alpha lane, for
++// the two pixels packed in a 16-bit-per-channel vector. Built against the
++// little-endian byte order; src is the channel vector, the second operand is
++// an all-0xff vector.
++static const u8x16 kSpreadAlphaLast = {6,  7,  6,  7,  6,  7,  16, 7,
++                                       14, 15, 14, 15, 14, 15, 16, 15};
++static const u8x16 kSpreadAlphaFirst = {16, 1, 0, 1, 0, 1, 0, 1,
++                                        16, 9, 8, 9, 8, 9, 8, 9};
++
++static WEBP_INLINE u16x8 MulHi16(u16x8 a, u16x8 b) {
++  const u32x4 sh = vec_splats((unsigned int)16);
++  const u32x4 e = vec_sr(vec_mule(a, b), sh);
++  const u32x4 o = vec_sr(vec_mulo(a, b), sh);
++  return vec_pack(vec_mergeh(e, o), vec_mergel(e, o));
++}
++
++static void ApplyAlphaMultiply_VSX(uint8_t* rgba, int alpha_first, int w, int h,
++                                   int stride) {
++  const u8x16 zero = vec_splats((unsigned char)0);
++  const u8x16 allff = vec_splats((unsigned char)0xff);
++  const u16x8 z16 = vec_splats((unsigned short)0);
++  const u16x8 kMult = vec_splats((unsigned short)0x8081);
++  const u16x8 sh7 = vec_splats((unsigned short)7);
++  const u8x16 ctrl = alpha_first ? kSpreadAlphaFirst : kSpreadAlphaLast;
++  const int kSpan = 4;
++  while (h-- > 0) {
++    uint8_t* const rgbx = rgba;
++    int i;
++    for (i = 0; i + kSpan <= w; i += kSpan) {
++      const u8x16 argb0 = vec_xl(0, (unsigned char*)(rgbx + 4 * i));
++      const u16x8 lo = (u16x8)vec_mergeh(argb0, zero);
++      const u16x8 hi = (u16x8)vec_mergel(argb0, zero);
++      const u16x8 a_lo = (u16x8)vec_perm((u8x16)lo, allff, ctrl);
++      const u16x8 a_hi = (u16x8)vec_perm((u8x16)hi, allff, ctrl);
++      const u16x8 A0lo = vec_mladd(a_lo, lo, z16);
++      const u16x8 A0hi = vec_mladd(a_hi, hi, z16);
++      const u16x8 A2lo = vec_sr(MulHi16(A0lo, kMult), sh7);
++      const u16x8 A2hi = vec_sr(MulHi16(A0hi, kMult), sh7);
++      const u8x16 out = vec_packsu((i16x8)A2lo, (i16x8)A2hi);
++      vec_xst(out, 0, (unsigned char*)(rgbx + 4 * i));
++    }
++    // Finish with left-overs.
++    for (; i < w; ++i) {
++      uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
++      const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
++      const uint32_t a = alpha[4 * i];
++      if (a != 0xff) {
++        const uint32_t mult = MULTIPLIER(a);
++        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
++        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
++        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
++      }
++    }
++    rgba += stride;
++  }
++}
++
++#undef MULTIPLIER
++#undef PREMULTIPLY
++
++//------------------------------------------------------------------------------
++
++extern void WebPInitAlphaProcessingVSX(void);
++
++WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingVSX(void) {
++  WebPApplyAlphaMultiply = ApplyAlphaMultiply_VSX;
++  WebPDispatchAlpha = DispatchAlpha_VSX;
++  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_VSX;
++  WebPExtractAlpha = ExtractAlpha_VSX;
++  WebPExtractGreen = ExtractGreen_VSX;
++}
++
++#else  // !WEBP_USE_VSX
++
++WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingVSX)
++
++#endif  // WEBP_USE_VSX
+diff --git a/media/libwebp/src/dsp/cpu.h b/media/libwebp/src/dsp/cpu.h
+index 17c4db971c7f..d1d4b3127c84 100644
+--- a/media/libwebp/src/dsp/cpu.h
++++ b/media/libwebp/src/dsp/cpu.h
+@@ -154,6 +154,17 @@
+ #define WEBP_USE_MSA
+ #endif
+ 
++//------------------------------------------------------------------------------
++// PPC64 / Power VSX (ISA 2.07 / POWER8 baseline).
++
++#if defined(__powerpc64__) && defined(__VSX__)
++#define WEBP_USE_VSX
++#endif
++
++#if defined(WEBP_USE_VSX) && !defined(WEBP_HAVE_VSX)
++#define WEBP_HAVE_VSX
++#endif
++
+ //------------------------------------------------------------------------------
+ 
+ #ifndef WEBP_DSP_OMIT_C_CODE
+@@ -308,7 +319,8 @@ typedef enum {
+   kNEON,
+   kMIPS32,
+   kMIPSdspR2,
+-  kMSA
++  kMSA,
++  kVSX
+ } CPUFeature;
+ 
+ // returns true if the CPU supports the feature.
+diff --git a/media/libwebp/src/dsp/dec.c b/media/libwebp/src/dsp/dec.c
+index 4f38309980ce..f34276ba7316 100644
+--- a/media/libwebp/src/dsp/dec.c
++++ b/media/libwebp/src/dsp/dec.c
+@@ -752,6 +752,7 @@ extern void VP8DspInitNEON(void);
+ extern void VP8DspInitMIPS32(void);
+ extern void VP8DspInitMIPSdspR2(void);
+ extern void VP8DspInitMSA(void);
++extern void VP8DspInitVSX(void);
+ 
+ WEBP_DSP_INIT_FUNC(VP8DspInit) {
+   VP8InitClipTables();
+@@ -843,6 +844,11 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
+     if (VP8GetCPUInfo(kMSA)) {
+       VP8DspInitMSA();
+     }
++#endif
++#if defined(WEBP_HAVE_VSX)
++    if (VP8GetCPUInfo(kVSX)) {
++      VP8DspInitVSX();
++    }
+ #endif
+   }
+ 
+diff --git a/media/libwebp/src/dsp/dec_vsx.c b/media/libwebp/src/dsp/dec_vsx.c
+new file mode 100644
+index 000000000000..e0c1cbc3b71b
+--- /dev/null
++++ b/media/libwebp/src/dsp/dec_vsx.c
+@@ -0,0 +1,737 @@
++// Copyright 2011 Google Inc. All Rights Reserved.
++//
++// Use of this source code is governed by a BSD-style license
++// that can be found in the COPYING file in the root of the source
++// tree. An additional intellectual property rights grant can be found
++// in the file PATENTS. All contributing project authors may
++// be found in the AUTHORS file in the root of the source tree.
++// -----------------------------------------------------------------------------
++//
++// VSX (PowerPC) version of decoding functions.
++
++#include "src/dsp/dsp.h"
++
++#if defined(WEBP_USE_VSX)
++
++#include <altivec.h>
++#include <string.h>
++
++typedef __vector signed short     i16x8;
++typedef __vector unsigned short   u16x8;
++typedef __vector signed int       i32x4;
++typedef __vector unsigned int     u32x4;
++typedef __vector unsigned char    u8x16;
++typedef __vector signed char      i8x16;
++typedef __vector signed long long i64x2;
++
++// Signed multiply-high of packed 16-bit lanes (POWER8 has no vmulhsh).
++static WEBP_INLINE i16x8 MulHi16_S(i16x8 a, i16x8 b) {
++  const u32x4 sh = vec_splats((unsigned int)16);
++  const i32x4 e = vec_sra(vec_mule(a, b), sh);
++  const i32x4 o = vec_sra(vec_mulo(a, b), sh);
++  return (i16x8)vec_pack(vec_mergeh(e, o), vec_mergel(e, o));
++}
++
++// Transpose two interleaved 4x4 blocks of 16-bit values.
++static WEBP_INLINE void Transpose2_4x4(i16x8 in0, i16x8 in1, i16x8 in2,
++                                       i16x8 in3, i16x8* out0, i16x8* out1,
++                                       i16x8* out2, i16x8* out3) {
++  const i16x8 t0 = (i16x8)vec_mergeh(in0, in1);
++  const i16x8 t1 = (i16x8)vec_mergeh(in2, in3);
++  const i16x8 t2 = (i16x8)vec_mergel(in0, in1);
++  const i16x8 t3 = (i16x8)vec_mergel(in2, in3);
++  const i32x4 u0 = vec_mergeh((i32x4)t0, (i32x4)t1);
++  const i32x4 u1 = vec_mergeh((i32x4)t2, (i32x4)t3);
++  const i32x4 u2 = vec_mergel((i32x4)t0, (i32x4)t1);
++  const i32x4 u3 = vec_mergel((i32x4)t2, (i32x4)t3);
++  *out0 = (i16x8)vec_mergeh((i64x2)u0, (i64x2)u1);
++  *out1 = (i16x8)vec_mergel((i64x2)u0, (i64x2)u1);
++  *out2 = (i16x8)vec_mergeh((i64x2)u2, (i64x2)u3);
++  *out3 = (i16x8)vec_mergel((i64x2)u2, (i64x2)u3);
++}
++
++// Bounded 4-coefficient load into the low half of a 16-bit vector.
++static WEBP_INLINE i16x8 Load4Coeffs(const int16_t* WEBP_RESTRICT p) {
++  int16_t tmp[8] = {0};
++  memcpy(tmp, p, 4 * sizeof(int16_t));
++  return *(const i16x8*)tmp;
++}
++
++// Bounded load of n pixels, zero-extended to 16-bit lanes.
++static WEBP_INLINE i16x8 LoadDst(const uint8_t* WEBP_RESTRICT p, int n) {
++  unsigned char tmp[16] = {0};
++  memcpy(tmp, p, n);
++  return (i16x8)vec_mergeh(vec_xl(0, tmp), vec_splats((unsigned char)0));
++}
++
++static void Transform_VSX(const int16_t* WEBP_RESTRICT in,
++                          uint8_t* WEBP_RESTRICT dst, int do_two) {
++  const i16x8 k1 = vec_splats((short)20091);
++  const i16x8 k2 = vec_splats((short)-30068);
++  const u16x8 three = vec_splats((unsigned short)3);
++  i16x8 in0 = Load4Coeffs(in + 0), in1 = Load4Coeffs(in + 4);
++  i16x8 in2 = Load4Coeffs(in + 8), in3 = Load4Coeffs(in + 12);
++  i16x8 T0, T1, T2, T3;
++
++  if (do_two) {
++    in0 = (i16x8)vec_mergeh((i64x2)in0, (i64x2)Load4Coeffs(in + 16));
++    in1 = (i16x8)vec_mergeh((i64x2)in1, (i64x2)Load4Coeffs(in + 20));
++    in2 = (i16x8)vec_mergeh((i64x2)in2, (i64x2)Load4Coeffs(in + 24));
++    in3 = (i16x8)vec_mergeh((i64x2)in3, (i64x2)Load4Coeffs(in + 28));
++  }
++
++  {  // Vertical pass + transpose.
++    const i16x8 a = vec_add(in0, in2);
++    const i16x8 b = vec_sub(in0, in2);
++    const i16x8 c = vec_add(vec_sub(in1, in3),
++                            vec_sub(MulHi16_S(in1, k2), MulHi16_S(in3, k1)));
++    const i16x8 d = vec_add(vec_add(in1, in3),
++                            vec_add(MulHi16_S(in1, k1), MulHi16_S(in3, k2)));
++    Transpose2_4x4(vec_add(a, d), vec_add(b, c), vec_sub(b, c), vec_sub(a, d),
++                   &T0, &T1, &T2, &T3);
++  }
++  {  // Horizontal pass + transpose.
++    const i16x8 dc = vec_add(T0, vec_splats((short)4));
++    const i16x8 a = vec_add(dc, T2);
++    const i16x8 b = vec_sub(dc, T2);
++    const i16x8 c = vec_add(vec_sub(T1, T3),
++                            vec_sub(MulHi16_S(T1, k2), MulHi16_S(T3, k1)));
++    const i16x8 d = vec_add(vec_add(T1, T3),
++                            vec_add(MulHi16_S(T1, k1), MulHi16_S(T3, k2)));
++    const i16x8 s0 = vec_sra(vec_add(a, d), three);
++    const i16x8 s1 = vec_sra(vec_add(b, c), three);
++    const i16x8 s2 = vec_sra(vec_sub(b, c), three);
++    const i16x8 s3 = vec_sra(vec_sub(a, d), three);
++    Transpose2_4x4(s0, s1, s2, s3, &T0, &T1, &T2, &T3);
++  }
++  {  // Add to the reference pixels and store with saturation.
++    const int n = do_two ? 8 : 4;
++    const i16x8 d0 = LoadDst(dst + 0 * BPS, n);
++    const i16x8 d1 = LoadDst(dst + 1 * BPS, n);
++    const i16x8 d2 = LoadDst(dst + 2 * BPS, n);
++    const i16x8 d3 = LoadDst(dst + 3 * BPS, n);
++    const u8x16 r0 = vec_packsu(vec_add(d0, T0), vec_add(d0, T0));
++    const u8x16 r1 = vec_packsu(vec_add(d1, T1), vec_add(d1, T1));
++    const u8x16 r2 = vec_packsu(vec_add(d2, T2), vec_add(d2, T2));
++    const u8x16 r3 = vec_packsu(vec_add(d3, T3), vec_add(d3, T3));
++    unsigned char b0[16], b1[16], b2[16], b3[16];
++    memcpy(b0, &r0, 16); memcpy(b1, &r1, 16);
++    memcpy(b2, &r2, 16); memcpy(b3, &r3, 16);
++    memcpy(dst + 0 * BPS, b0, n); memcpy(dst + 1 * BPS, b1, n);
++    memcpy(dst + 2 * BPS, b2, n); memcpy(dst + 3 * BPS, b3, n);
++  }
++}
++
++//------------------------------------------------------------------------------
++// Simple in-loop edge filtering.
++
++#define ABSU(p, q) vec_or(vec_subs((u8x16)(q), (u8x16)(p)), \
++                          vec_subs((u8x16)(p), (u8x16)(q)))
++
++// Per-byte signed arithmetic >>3, packed with saturation.
++static WEBP_INLINE i8x16 SignedShift3(i8x16 x) {
++  const u8x16 z = vec_splats((unsigned char)0);
++  const u16x8 sh = vec_splats((unsigned short)(3 + 8));
++  const i16x8 lo = vec_sra((i16x8)vec_mergeh(z, (u8x16)x), sh);
++  const i16x8 hi = vec_sra((i16x8)vec_mergel(z, (u8x16)x), sh);
++  return (i8x16)vec_packs(lo, hi);
++}
++
++static WEBP_INLINE void DoFilter2_VSX(u8x16* WEBP_RESTRICT p1,
++                                      u8x16* WEBP_RESTRICT p0,
++                                      u8x16* WEBP_RESTRICT q0,
++                                      u8x16* WEBP_RESTRICT q1, int thresh) {
++  const u8x16 sign = vec_splats((unsigned char)0x80);
++  const u8x16 t1 = ABSU(*p1, *q1);
++  const u8x16 t2 = vec_and(t1, vec_splats((unsigned char)0xFE));
++  const u8x16 t3 = (u8x16)vec_sr((u16x8)t2, vec_splats((unsigned short)1));
++  const u8x16 t4 = ABSU(*p0, *q0);
++  const u8x16 t6 = vec_adds(vec_adds(t4, t4), t3);
++  const u8x16 t7 = vec_subs(t6, vec_splats((unsigned char)thresh));
++  const u8x16 mask = (u8x16)vec_cmpeq(t7, vec_splats((unsigned char)0));
++
++  const i8x16 p1s = (i8x16)vec_xor(*p1, sign);
++  const i8x16 q1s = (i8x16)vec_xor(*q1, sign);
++  i8x16 P0 = (i8x16)vec_xor(*p0, sign);
++  i8x16 Q0 = (i8x16)vec_xor(*q0, sign);
++
++  const i8x16 d0 = vec_subs(Q0, P0);
++  const i8x16 s1 = vec_adds(vec_subs(p1s, q1s), d0);
++  i8x16 a = vec_adds(d0, vec_adds(d0, s1));
++  a = vec_and(a, (i8x16)mask);
++  const i8x16 v3 = SignedShift3(vec_adds(a, vec_splats((signed char)3)));
++  const i8x16 v4 = SignedShift3(vec_adds(a, vec_splats((signed char)4)));
++  Q0 = vec_subs(Q0, v4);
++  P0 = vec_adds(P0, v3);
++  *p0 = vec_xor((u8x16)P0, sign);
++  *q0 = vec_xor((u8x16)Q0, sign);
++}
++
++static void SimpleVFilter16_VSX(uint8_t* p, int stride, int thresh) {
++  u8x16 p1 = vec_xl(0, p - 2 * stride);
++  u8x16 p0 = vec_xl(0, p - stride);
++  u8x16 q0 = vec_xl(0, p);
++  u8x16 q1 = vec_xl(0, p + stride);
++  DoFilter2_VSX(&p1, &p0, &q0, &q1, thresh);
++  vec_xst(p0, 0, p - stride);
++  vec_xst(q0, 0, p);
++}
++
++static void SimpleVFilter16i_VSX(uint8_t* p, int stride, int thresh) {
++  int k;
++  for (k = 3; k > 0; --k) {
++    p += 4 * stride;
++    SimpleVFilter16_VSX(p, stride, thresh);
++  }
++}
++
++// Transpose four columns out of / into 16 rows for horizontal-edge filtering.
++static WEBP_INLINE void Load8x4(const uint8_t* WEBP_RESTRICT b, int s,
++                                u8x16* WEBP_RESTRICT p, u8x16* WEBP_RESTRICT q) {
++  uint32_t a0[4], a1[4];
++  memcpy(&a0[0], b + 0 * s, 4); memcpy(&a0[1], b + 4 * s, 4);
++  memcpy(&a0[2], b + 2 * s, 4); memcpy(&a0[3], b + 6 * s, 4);
++  memcpy(&a1[0], b + 1 * s, 4); memcpy(&a1[1], b + 5 * s, 4);
++  memcpy(&a1[2], b + 3 * s, 4); memcpy(&a1[3], b + 7 * s, 4);
++  const u8x16 A0 = vec_xl(0, (unsigned char*)a0);
++  const u8x16 A1 = vec_xl(0, (unsigned char*)a1);
++  const u8x16 B0 = vec_mergeh(A0, A1), B1 = vec_mergel(A0, A1);
++  const u16x8 C0 = vec_mergeh((u16x8)B0, (u16x8)B1);
++  const u16x8 C1 = vec_mergel((u16x8)B0, (u16x8)B1);
++  *p = (u8x16)vec_mergeh((u32x4)C0, (u32x4)C1);
++  *q = (u8x16)vec_mergel((u32x4)C0, (u32x4)C1);
++}
++
++static WEBP_INLINE void Load16x4(const uint8_t* WEBP_RESTRICT r0,
++                                 const uint8_t* WEBP_RESTRICT r8, int s,
++                                 u8x16* p1, u8x16* p0, u8x16* q0, u8x16* q1) {
++  Load8x4(r0, s, p1, q0);
++  Load8x4(r8, s, p0, q1);
++  const u8x16 t1 = *p1, t2 = *q0;
++  *p1 = (u8x16)vec_mergeh((i64x2)t1, (i64x2)*p0);
++  *p0 = (u8x16)vec_mergel((i64x2)t1, (i64x2)*p0);
++  *q0 = (u8x16)vec_mergeh((i64x2)t2, (i64x2)*q1);
++  *q1 = (u8x16)vec_mergel((i64x2)t2, (i64x2)*q1);
++}
++
++static WEBP_INLINE void Store4x4(u8x16 x, uint8_t* WEBP_RESTRICT dst, int s) {
++  unsigned char b[16];
++  int i;
++  memcpy(b, &x, 16);
++  for (i = 0; i < 4; ++i) memcpy(dst + i * s, b + 4 * i, 4);
++}
++
++static WEBP_INLINE void Store16x4(u8x16 p1, u8x16 p0, u8x16 q0, u8x16 q1,
++                                  uint8_t* WEBP_RESTRICT r0,
++                                  uint8_t* WEBP_RESTRICT r8, int s) {
++  u8x16 t = p0;
++  u8x16 p0s = vec_mergeh(p1, t), p1s = vec_mergel(p1, t);
++  t = q0;
++  u8x16 q0s = vec_mergeh(t, q1), q1s = vec_mergel(t, q1);
++  t = p0s;
++  p0s = (u8x16)vec_mergeh((u16x8)t, (u16x8)q0s);
++  q0s = (u8x16)vec_mergel((u16x8)t, (u16x8)q0s);
++  t = p1s;
++  p1s = (u8x16)vec_mergeh((u16x8)t, (u16x8)q1s);
++  q1s = (u8x16)vec_mergel((u16x8)t, (u16x8)q1s);
++  Store4x4(p0s, r0, s); Store4x4(q0s, r0 + 4 * s, s);
++  Store4x4(p1s, r8, s); Store4x4(q1s, r8 + 4 * s, s);
++}
++
++static void SimpleHFilter16_VSX(uint8_t* p, int stride, int thresh) {
++  u8x16 p1, p0, q0, q1;
++  p -= 2;  // beginning of p1
++  Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
++  DoFilter2_VSX(&p1, &p0, &q0, &q1, thresh);
++  Store16x4(p1, p0, q0, q1, p, p + 8 * stride, stride);
++}
++
++static void SimpleHFilter16i_VSX(uint8_t* p, int stride, int thresh) {
++  int k;
++  for (k = 3; k > 0; --k) {
++    p += 4;
++    SimpleHFilter16_VSX(p, stride, thresh);
++  }
++}
++
++//------------------------------------------------------------------------------
++// Complex in-loop edge filtering (vertical/luma).
++
++static const u8x16 kSignBit = {
++    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
++    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
++#define FLIPB(x) ((x) = (i8x16)vec_xor((u8x16)(x), kSignBit))
++
++static WEBP_INLINE u8x16 GetNotHEV(u8x16 p1, u8x16 p0, u8x16 q0, u8x16 q1,
++                                   int hev_thresh) {
++  const u8x16 d = vec_subs(vec_max(ABSU(p1, p0), ABSU(q1, q0)),
++                           vec_splats((unsigned char)hev_thresh));
++  return (u8x16)vec_cmpeq(d, vec_splats((unsigned char)0));
++}
++
++static WEBP_INLINE i8x16 GetBaseDelta(i8x16 p1, i8x16 p0, i8x16 q0, i8x16 q1) {
++  const i8x16 d = vec_subs(q0, p0);
++  const i8x16 s1 = vec_adds(vec_subs(p1, q1), d);
++  return vec_adds(d, vec_adds(d, s1));
++}
++
++static WEBP_INLINE void DoSimpleFilterS(i8x16* p0, i8x16* q0, i8x16 f) {
++  *q0 = vec_subs(*q0, SignedShift3(vec_adds(f, vec_splats((signed char)4))));
++  *p0 = vec_adds(*p0, SignedShift3(vec_adds(f, vec_splats((signed char)3))));
++}
++
++static WEBP_INLINE void Update2Pixels(i8x16* pi, i8x16* qi, i16x8 lo, i16x8 hi) {
++  const u16x8 s7 = vec_splats((unsigned short)7);
++  const i8x16 d = (i8x16)vec_packs(vec_sra(lo, s7), vec_sra(hi, s7));
++  *pi = vec_adds(*pi, d);
++  *qi = vec_subs(*qi, d);
++  FLIPB(*pi);
++  FLIPB(*qi);
++}
++
++// mask = (max inner abs-diff <= ithresh) && NeedsFilter(thresh).
++static WEBP_INLINE u8x16 ComplexMask(u8x16 p3, u8x16 p2, u8x16 p1, u8x16 p0,
++                                     u8x16 q0, u8x16 q1, u8x16 q2, u8x16 q3,
++                                     int thresh, int ithresh) {
++  u8x16 m = ABSU(p1, p0);
++  m = vec_max(m, ABSU(p3, p2));
++  m = vec_max(m, ABSU(p2, p1));
++  m = vec_max(m, ABSU(q1, q0));
++  m = vec_max(m, ABSU(q3, q2));
++  m = vec_max(m, ABSU(q2, q1));
++  const u8x16 tm = (u8x16)vec_cmpeq(
++      vec_subs(m, vec_splats((unsigned char)ithresh)),
++      vec_splats((unsigned char)0));
++  const u8x16 t2 = vec_and(ABSU(p1, q1), vec_splats((unsigned char)0xFE));
++  const u8x16 t3 = (u8x16)vec_sr((u16x8)t2, vec_splats((unsigned short)1));
++  const u8x16 t6 = vec_adds(vec_adds(ABSU(p0, q0), ABSU(p0, q0)), t3);
++  const u8x16 fm = (u8x16)vec_cmpeq(
++      vec_subs(t6, vec_splats((unsigned char)thresh)),
++      vec_splats((unsigned char)0));
++  return vec_and(tm, fm);
++}
++
++static WEBP_INLINE void DoFilter4(u8x16* p1u, u8x16* p0u, u8x16* q0u,
++                                  u8x16* q1u, u8x16 mask, int hev_thresh) {
++  const u8x16 zero = vec_splats((unsigned char)0);
++  const u8x16 not_hev = GetNotHEV(*p1u, *p0u, *q0u, *q1u, hev_thresh);
++  i8x16 p1 = (i8x16)vec_xor(*p1u, kSignBit), p0 = (i8x16)vec_xor(*p0u, kSignBit);
++  i8x16 q0 = (i8x16)vec_xor(*q0u, kSignBit), q1 = (i8x16)vec_xor(*q1u, kSignBit);
++  i8x16 t1 = vec_andc(vec_subs(p1, q1), (i8x16)not_hev);
++  const i8x16 t2 = vec_subs(q0, p0);
++  t1 = vec_adds(t1, t2); t1 = vec_adds(t1, t2); t1 = vec_adds(t1, t2);
++  t1 = vec_and(t1, (i8x16)mask);
++  const i8x16 a3 = SignedShift3(vec_adds(t1, vec_splats((signed char)4)));
++  p0 = vec_adds(p0, SignedShift3(vec_adds(t1, vec_splats((signed char)3))));
++  q0 = vec_subs(q0, a3);
++  FLIPB(p0); FLIPB(q0);
++  const i8x16 t = vec_add(a3, (i8x16)kSignBit);
++  i8x16 t3 = vec_sub((i8x16)vec_avg((u8x16)t, zero), vec_splats((signed char)64));
++  t3 = vec_and((i8x16)not_hev, t3);
++  q1 = vec_subs(q1, t3); p1 = vec_adds(p1, t3);
++  FLIPB(p1); FLIPB(q1);
++  *p1u = (u8x16)p1; *p0u = (u8x16)p0; *q0u = (u8x16)q0; *q1u = (u8x16)q1;
++}
++
++static WEBP_INLINE void DoFilter6(u8x16* p2u, u8x16* p1u, u8x16* p0u,
++                                  u8x16* q0u, u8x16* q1u, u8x16* q2u,
++                                  u8x16 mask, int hev_thresh) {
++  const u8x16 zero = vec_splats((unsigned char)0);
++  const u8x16 not_hev = GetNotHEV(*p1u, *p0u, *q0u, *q1u, hev_thresh);
++  i8x16 p2 = (i8x16)vec_xor(*p2u, kSignBit), p1 = (i8x16)vec_xor(*p1u, kSignBit);
++  i8x16 p0 = (i8x16)vec_xor(*p0u, kSignBit), q0 = (i8x16)vec_xor(*q0u, kSignBit);
++  i8x16 q1 = (i8x16)vec_xor(*q1u, kSignBit), q2 = (i8x16)vec_xor(*q2u, kSignBit);
++  const i8x16 a = GetBaseDelta(p1, p0, q0, q1);
++  {  // hev pixels: simple filter
++    const i8x16 f = vec_and(a, (i8x16)vec_andc(mask, not_hev));
++    DoSimpleFilterS(&p0, &q0, f);
++  }
++  {  // non-hev pixels: strong filter
++    const i8x16 f = vec_and(a, vec_and((i8x16)not_hev, (i8x16)mask));
++    const i16x8 k9 = vec_splats((short)0x0900), k63 = vec_splats((short)63);
++    const i16x8 f9lo = MulHi16_S((i16x8)vec_mergeh(zero, (u8x16)f), k9);
++    const i16x8 f9hi = MulHi16_S((i16x8)vec_mergel(zero, (u8x16)f), k9);
++    const i16x8 a2lo = vec_add(f9lo, k63), a2hi = vec_add(f9hi, k63);
++    const i16x8 a1lo = vec_add(a2lo, f9lo), a1hi = vec_add(a2hi, f9hi);
++    const i16x8 a0lo = vec_add(a1lo, f9lo), a0hi = vec_add(a1hi, f9hi);
++    Update2Pixels(&p2, &q2, a2lo, a2hi);
++    Update2Pixels(&p1, &q1, a1lo, a1hi);
++    Update2Pixels(&p0, &q0, a0lo, a0hi);
++  }
++  *p2u = (u8x16)p2; *p1u = (u8x16)p1; *p0u = (u8x16)p0;
++  *q0u = (u8x16)q0; *q1u = (u8x16)q1; *q2u = (u8x16)q2;
++}
++
++static void VFilter16_VSX(uint8_t* p, int s, int thresh, int ithresh,
++                          int hev_thresh) {
++  u8x16 p3 = vec_xl(0, p - 4 * s), p2 = vec_xl(0, p - 3 * s);
++  u8x16 p1 = vec_xl(0, p - 2 * s), p0 = vec_xl(0, p - s);
++  u8x16 q0 = vec_xl(0, p), q1 = vec_xl(0, p + s);
++  u8x16 q2 = vec_xl(0, p + 2 * s), q3 = vec_xl(0, p + 3 * s);
++  const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, m, hev_thresh);
++  vec_xst(p2, 0, p - 3 * s); vec_xst(p1, 0, p - 2 * s); vec_xst(p0, 0, p - s);
++  vec_xst(q0, 0, p); vec_xst(q1, 0, p + s); vec_xst(q2, 0, p + 2 * s);
++}
++
++static void VFilter16i_VSX(uint8_t* p, int s, int thresh, int ithresh,
++                           int hev_thresh) {
++  int k;
++  for (k = 3; k > 0; --k) {
++    p += 4 * s;
++    u8x16 p3 = vec_xl(0, p - 4 * s), p2 = vec_xl(0, p - 3 * s);
++    u8x16 p1 = vec_xl(0, p - 2 * s), p0 = vec_xl(0, p - s);
++    u8x16 q0 = vec_xl(0, p), q1 = vec_xl(0, p + s);
++    u8x16 q2 = vec_xl(0, p + 2 * s), q3 = vec_xl(0, p + 3 * s);
++    const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++    DoFilter4(&p1, &p0, &q0, &q1, m, hev_thresh);
++    vec_xst(p1, 0, p - 2 * s); vec_xst(p0, 0, p - s);
++    vec_xst(q0, 0, p); vec_xst(q1, 0, p + s);
++  }
++}
++
++// Complex horizontal luma: two 16x4 transposes around the vertical edge feed
++// the same DoFilter4/DoFilter6 used by the vertical variants.
++static void HFilter16_VSX(uint8_t* p, int s, int thresh, int ithresh,
++                          int hev_thresh) {
++  uint8_t* const b = p - 4;
++  u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
++  Load16x4(b, b + 8 * s, s, &p3, &p2, &p1, &p0);
++  Load16x4(p, p + 8 * s, s, &q0, &q1, &q2, &q3);
++  const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, m, hev_thresh);
++  Store16x4(p3, p2, p1, p0, b, b + 8 * s, s);
++  Store16x4(q0, q1, q2, q3, p, p + 8 * s, s);
++}
++
++static void HFilter16i_VSX(uint8_t* p, int s, int thresh, int ithresh,
++                           int hev_thresh) {
++  int k;
++  for (k = 3; k > 0; --k) {
++    p += 4;
++    u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
++    Load16x4(p - 4, p - 4 + 8 * s, s, &p3, &p2, &p1, &p0);
++    Load16x4(p, p + 8 * s, s, &q0, &q1, &q2, &q3);
++    const u8x16 m =
++        ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++    DoFilter4(&p1, &p0, &q0, &q1, m, hev_thresh);
++    Store16x4(p1, p0, q0, q1, p - 2, p - 2 + 8 * s, s);
++  }
++}
++
++//------------------------------------------------------------------------------
++// Complex chroma filtering: operate on the u and v planes (8 wide) together.
++
++// Pack 8 u-bytes into the low half and 8 v-bytes into the high half.
++static WEBP_INLINE u8x16 LoadUV(const uint8_t* WEBP_RESTRICT u,
++                                const uint8_t* WEBP_RESTRICT v) {
++  unsigned char b[16];
++  memcpy(b, u, 8);
++  memcpy(b + 8, v, 8);
++  return vec_xl(0, b);
++}
++
++static WEBP_INLINE void StoreUV(u8x16 x, uint8_t* WEBP_RESTRICT u,
++                                uint8_t* WEBP_RESTRICT v) {
++  unsigned char b[16];
++  memcpy(b, &x, 16);
++  memcpy(u, b, 8);
++  memcpy(v, b + 8, 8);
++}
++
++static void VFilter8_VSX(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
++                         int s, int thresh, int ithresh, int hev_thresh) {
++  u8x16 p3 = LoadUV(u - 4 * s, v - 4 * s), p2 = LoadUV(u - 3 * s, v - 3 * s);
++  u8x16 p1 = LoadUV(u - 2 * s, v - 2 * s), p0 = LoadUV(u - s, v - s);
++  u8x16 q0 = LoadUV(u, v), q1 = LoadUV(u + s, v + s);
++  u8x16 q2 = LoadUV(u + 2 * s, v + 2 * s), q3 = LoadUV(u + 3 * s, v + 3 * s);
++  const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, m, hev_thresh);
++  StoreUV(p2, u - 3 * s, v - 3 * s); StoreUV(p1, u - 2 * s, v - 2 * s);
++  StoreUV(p0, u - s, v - s); StoreUV(q0, u, v);
++  StoreUV(q1, u + s, v + s); StoreUV(q2, u + 2 * s, v + 2 * s);
++}
++
++static void VFilter8i_VSX(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
++                          int s, int thresh, int ithresh, int hev_thresh) {
++  u += 4 * s; v += 4 * s;
++  u8x16 p3 = LoadUV(u - 4 * s, v - 4 * s), p2 = LoadUV(u - 3 * s, v - 3 * s);
++  u8x16 p1 = LoadUV(u - 2 * s, v - 2 * s), p0 = LoadUV(u - s, v - s);
++  u8x16 q0 = LoadUV(u, v), q1 = LoadUV(u + s, v + s);
++  u8x16 q2 = LoadUV(u + 2 * s, v + 2 * s), q3 = LoadUV(u + 3 * s, v + 3 * s);
++  const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++  DoFilter4(&p1, &p0, &q0, &q1, m, hev_thresh);
++  StoreUV(p1, u - 2 * s, v - 2 * s); StoreUV(p0, u - s, v - s);
++  StoreUV(q0, u, v); StoreUV(q1, u + s, v + s);
++}
++
++static void HFilter8_VSX(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
++                         int s, int thresh, int ithresh, int hev_thresh) {
++  u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
++  Load16x4(u - 4, v - 4, s, &p3, &p2, &p1, &p0);
++  Load16x4(u, v, s, &q0, &q1, &q2, &q3);
++  const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, m, hev_thresh);
++  Store16x4(p3, p2, p1, p0, u - 4, v - 4, s);
++  Store16x4(q0, q1, q2, q3, u, v, s);
++}
++
++static void HFilter8i_VSX(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
++                          int s, int thresh, int ithresh, int hev_thresh) {
++  u += 4; v += 4;
++  u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
++  Load16x4(u - 4, v - 4, s, &p3, &p2, &p1, &p0);
++  Load16x4(u, v, s, &q0, &q1, &q2, &q3);
++  const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++  DoFilter4(&p1, &p0, &q0, &q1, m, hev_thresh);
++  Store16x4(p1, p0, q0, q1, u - 2, v - 2, s);
++}
++
++//------------------------------------------------------------------------------
++// Intra prediction (16x16 luma, 8x8 chroma). DC top-sums are scalar (the SIMD
++// win is the block fill); TrueMotion/VE/HE are vectorized.
++
++static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
++  const u8x16 x = vec_splats(v);
++  int j;
++  for (j = 0; j < 16; ++j) vec_xst(x, 0, dst + j * BPS);
++}
++static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
++  const u8x16 x = vec_splats(v);
++  unsigned char b[16];
++  int j;
++  memcpy(b, &x, 16);
++  for (j = 0; j < 8; ++j) memcpy(dst + j * BPS, b, 8);
++}
++
++static void VE16_VSX(uint8_t* dst) {
++  const u8x16 top = vec_xl(0, dst - BPS);
++  int j;
++  for (j = 0; j < 16; ++j) vec_xst(top, 0, dst + j * BPS);
++}
++static void HE16_VSX(uint8_t* dst) {
++  int j;
++  for (j = 0; j < 16; ++j) vec_xst(vec_splats(dst[-1 + j * BPS]), 0, dst + j * BPS);
++}
++static void DC16_VSX(uint8_t* dst) {
++  int s = 16, j;
++  for (j = 0; j < 16; ++j) s += dst[-BPS + j] + dst[-1 + j * BPS];
++  Put16(s >> 5, dst);
++}
++static void DC16NoTop_VSX(uint8_t* dst) {
++  int s = 8, j;
++  for (j = 0; j < 16; ++j) s += dst[-1 + j * BPS];
++  Put16(s >> 4, dst);
++}
++static void DC16NoLeft_VSX(uint8_t* dst) {
++  int s = 8, j;
++  for (j = 0; j < 16; ++j) s += dst[-BPS + j];
++  Put16(s >> 4, dst);
++}
++static void DC16NoTopLeft_VSX(uint8_t* dst) { Put16(0x80, dst); }
++static void TM16_VSX(uint8_t* dst) {
++  const u8x16 zero = vec_splats((unsigned char)0);
++  const u8x16 t = vec_xl(0, dst - BPS);
++  const i16x8 tl = (i16x8)vec_mergeh(t, zero), th = (i16x8)vec_mergel(t, zero);
++  const int c = dst[-BPS - 1];
++  int y;
++  for (y = 0; y < 16; ++y) {
++    const i16x8 b = vec_splats((short)(dst[-1 + y * BPS] - c));
++    vec_xst((u8x16)vec_packsu(vec_add(b, tl), vec_add(b, th)), 0, dst + y * BPS);
++  }
++}
++
++static void VE8uv_VSX(uint8_t* dst) {
++  unsigned char t[8];
++  int j;
++  memcpy(t, dst - BPS, 8);
++  for (j = 0; j < 8; ++j) memcpy(dst + j * BPS, t, 8);
++}
++static void DC8uv_VSX(uint8_t* dst) {
++  int s = 8, j;
++  for (j = 0; j < 8; ++j) s += dst[-BPS + j] + dst[-1 + j * BPS];
++  Put8x8uv(s >> 4, dst);
++}
++static void DC8uvNoTop_VSX(uint8_t* dst) {
++  int s = 4, j;
++  for (j = 0; j < 8; ++j) s += dst[-1 + j * BPS];
++  Put8x8uv(s >> 3, dst);
++}
++static void DC8uvNoLeft_VSX(uint8_t* dst) {
++  int s = 4, j;
++  for (j = 0; j < 8; ++j) s += dst[-BPS + j];
++  Put8x8uv(s >> 3, dst);
++}
++static void DC8uvNoTopLeft_VSX(uint8_t* dst) { Put8x8uv(0x80, dst); }
++static void TM8uv_VSX(uint8_t* dst) {
++  const u8x16 zero = vec_splats((unsigned char)0);
++  const u8x16 t = vec_xl(0, dst - BPS);
++  const i16x8 tl = (i16x8)vec_mergeh(t, zero);
++  const int c = dst[-BPS - 1];
++  int y;
++  for (y = 0; y < 8; ++y) {
++    const i16x8 b = vec_splats((short)(dst[-1 + y * BPS] - c));
++    const u8x16 o = (u8x16)vec_packsu(vec_add(b, tl), vec_splats((short)0));
++    unsigned char bb[16];
++    memcpy(bb, &o, 16);
++    memcpy(dst + y * BPS, bb, 8);
++  }
++}
++
++//------------------------------------------------------------------------------
++// 4x4 luma intra prediction. Whole-vector byte shifts window the edge samples:
++//   srli_si128(x,n) == vec_sld(zero, x, 16 - n)
++//   slli_si128(x,n) == vec_sld(x, zero, n)
++
++#define SRLI(x, n) vec_sld(zero, (x), 16 - (n))
++#define SLLI(x, n) vec_sld((x), zero, (n))
++#define INS16(v, val, i) ((u8x16)vec_insert((short)(val), (i16x8)(v), (i)))
++#define AVG3C(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
++
++static WEBP_INLINE u8x16 Load64(const uint8_t* WEBP_RESTRICT p) {
++  unsigned char b[16] = {0};
++  memcpy(b, p, 8);
++  return vec_xl(0, b);
++}
++static WEBP_INLINE uint32_t GetWord(u8x16 v) {
++  unsigned char b[16];
++  uint32_t r;
++  memcpy(b, &v, 16);
++  memcpy(&r, b, 4);
++  return r;
++}
++static WEBP_INLINE u8x16 SetWord(uint32_t v) {
++  unsigned char b[16] = {0};
++  memcpy(b, &v, 4);
++  return vec_xl(0, b);
++}
++static WEBP_INLINE void StoreWord(uint32_t v, uint8_t* dst) {
++  memcpy(dst, &v, 4);
++}
++
++static void VE4_VSX(uint8_t* dst) {
++  const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
++  const u8x16 A = Load64(dst - BPS - 1), B = SRLI(A, 1), C = SRLI(A, 2);
++  const u8x16 a = vec_avg(A, C), lsb = vec_and(vec_xor(A, C), one);
++  const u8x16 avg = vec_avg(vec_subs(a, lsb), B);
++  const uint32_t v = GetWord(avg);
++  int i;
++  for (i = 0; i < 4; ++i) StoreWord(v, dst + i * BPS);
++}
++static void LD4_VSX(uint8_t* dst) {
++  const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
++  const u8x16 A = Load64(dst - BPS), B = SRLI(A, 1), C = SRLI(A, 2);
++  const u8x16 CH = INS16(C, dst[-BPS + 7], 3);
++  const u8x16 a1 = vec_avg(A, CH), lsb = vec_and(vec_xor(A, CH), one);
++  const u8x16 r = vec_avg(vec_subs(a1, lsb), B);
++  StoreWord(GetWord(r), dst + 0 * BPS);
++  StoreWord(GetWord(SRLI(r, 1)), dst + 1 * BPS);
++  StoreWord(GetWord(SRLI(r, 2)), dst + 2 * BPS);
++  StoreWord(GetWord(SRLI(r, 3)), dst + 3 * BPS);
++}
++static void VR4_VSX(uint8_t* dst) {
++  const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
++  const int I = dst[-1 + 0 * BPS], J = dst[-1 + 1 * BPS], K = dst[-1 + 2 * BPS];
++  const int X = dst[-1 - BPS];
++  const u8x16 XA = Load64(dst - BPS - 1), A0 = SRLI(XA, 1);
++  const u8x16 abcd = vec_avg(XA, A0);
++  const u8x16 IX = INS16(SLLI(XA, 1), (I | (X << 8)), 0);
++  const u8x16 a1 = vec_avg(IX, A0), lsb = vec_and(vec_xor(IX, A0), one);
++  const u8x16 efgh = vec_avg(vec_subs(a1, lsb), XA);
++  StoreWord(GetWord(abcd), dst + 0 * BPS);
++  StoreWord(GetWord(efgh), dst + 1 * BPS);
++  StoreWord(GetWord(SLLI(abcd, 1)), dst + 2 * BPS);
++  StoreWord(GetWord(SLLI(efgh, 1)), dst + 3 * BPS);
++  dst[0 + 2 * BPS] = AVG3C(J, I, X);
++  dst[0 + 3 * BPS] = AVG3C(K, J, I);
++}
++static void VL4_VSX(uint8_t* dst) {
++  const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
++  const u8x16 A = Load64(dst - BPS), B = SRLI(A, 1), C = SRLI(A, 2);
++  const u8x16 a1 = vec_avg(A, B), a2 = vec_avg(C, B), a3 = vec_avg(a1, a2);
++  const u8x16 lsb1 = vec_and(vec_xor(a1, a2), one);
++  const u8x16 abbc = vec_or(vec_xor(A, B), vec_xor(C, B));
++  const u8x16 a4 = vec_subs(a3, vec_and(abbc, lsb1));
++  const uint32_t extra = GetWord(SRLI(a4, 4));
++  StoreWord(GetWord(a1), dst + 0 * BPS);
++  StoreWord(GetWord(a4), dst + 1 * BPS);
++  StoreWord(GetWord(SRLI(a1, 1)), dst + 2 * BPS);
++  StoreWord(GetWord(SRLI(a4, 1)), dst + 3 * BPS);
++  dst[3 + 2 * BPS] = (extra >> 0) & 0xff;
++  dst[3 + 3 * BPS] = (extra >> 8) & 0xff;
++}
++static void RD4_VSX(uint8_t* dst) {
++  const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
++  const uint32_t I = dst[-1 + 0 * BPS], J = dst[-1 + 1 * BPS];
++  const uint32_t K = dst[-1 + 2 * BPS], L = dst[-1 + 3 * BPS];
++  const u8x16 XA = Load64(dst - BPS - 1);
++  const u8x16 all = vec_or(SetWord((uint32_t)(L | (K << 8) | (J << 16) | (I << 24))),
++                           SLLI(XA, 4));
++  const u8x16 k1 = SRLI(all, 1), j2 = SRLI(all, 2);
++  const u8x16 a1 = vec_avg(j2, all), lsb = vec_and(vec_xor(j2, all), one);
++  const u8x16 r = vec_avg(vec_subs(a1, lsb), k1);
++  StoreWord(GetWord(r), dst + 3 * BPS);
++  StoreWord(GetWord(SRLI(r, 1)), dst + 2 * BPS);
++  StoreWord(GetWord(SRLI(r, 2)), dst + 1 * BPS);
++  StoreWord(GetWord(SRLI(r, 3)), dst + 0 * BPS);
++}
++static void TM4_VSX(uint8_t* dst) {
++  const u8x16 zero = vec_splats((unsigned char)0);
++  const u8x16 t = Load64(dst - BPS);
++  const i16x8 tb = (i16x8)vec_mergeh(t, zero);
++  const int c = dst[-BPS - 1];
++  int y;
++  for (y = 0; y < 4; ++y) {
++    const i16x8 b = vec_splats((short)(dst[-1 + y * BPS] - c));
++    const u8x16 o = (u8x16)vec_packsu(vec_add(b, tb), vec_splats((short)0));
++    StoreWord(GetWord(o), dst + y * BPS);
++  }
++}
++#undef SRLI
++#undef SLLI
++#undef INS16
++#undef AVG3C
++
++extern void VP8DspInitVSX(void);
++
++WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitVSX(void) {
++  VP8Transform = Transform_VSX;
++  VP8SimpleVFilter16 = SimpleVFilter16_VSX;
++  VP8SimpleVFilter16i = SimpleVFilter16i_VSX;
++  VP8SimpleHFilter16 = SimpleHFilter16_VSX;
++  VP8SimpleHFilter16i = SimpleHFilter16i_VSX;
++  VP8VFilter16 = VFilter16_VSX;
++  VP8VFilter16i = VFilter16i_VSX;
++  VP8HFilter16 = HFilter16_VSX;
++  VP8HFilter16i = HFilter16i_VSX;
++  VP8VFilter8 = VFilter8_VSX;
++  VP8VFilter8i = VFilter8i_VSX;
++  VP8HFilter8 = HFilter8_VSX;
++  VP8HFilter8i = HFilter8i_VSX;
++
++  VP8PredLuma16[0] = DC16_VSX;
++  VP8PredLuma16[1] = TM16_VSX;
++  VP8PredLuma16[2] = VE16_VSX;
++  VP8PredLuma16[3] = HE16_VSX;
++  VP8PredLuma16[4] = DC16NoTop_VSX;
++  VP8PredLuma16[5] = DC16NoLeft_VSX;
++  VP8PredLuma16[6] = DC16NoTopLeft_VSX;
++  VP8PredChroma8[0] = DC8uv_VSX;
++  VP8PredChroma8[1] = TM8uv_VSX;
++  VP8PredChroma8[2] = VE8uv_VSX;
++  VP8PredChroma8[4] = DC8uvNoTop_VSX;
++  VP8PredChroma8[5] = DC8uvNoLeft_VSX;
++  VP8PredChroma8[6] = DC8uvNoTopLeft_VSX;
++  VP8PredLuma4[1] = TM4_VSX;
++  VP8PredLuma4[2] = VE4_VSX;
++  VP8PredLuma4[4] = RD4_VSX;
++  VP8PredLuma4[5] = VR4_VSX;
++  VP8PredLuma4[6] = LD4_VSX;
++  VP8PredLuma4[7] = VL4_VSX;
++}
++
++#else  // !WEBP_USE_VSX
++
++WEBP_DSP_INIT_STUB(VP8DspInitVSX)
++
++#endif  // WEBP_USE_VSX
+diff --git a/media/libwebp/src/dsp/filters.c b/media/libwebp/src/dsp/filters.c
+index 38da5252df3a..9962e1287402 100644
+--- a/media/libwebp/src/dsp/filters.c
++++ b/media/libwebp/src/dsp/filters.c
+@@ -217,6 +217,7 @@ extern void VP8FiltersInitMIPSdspR2(void);
+ extern void VP8FiltersInitMSA(void);
+ extern void VP8FiltersInitNEON(void);
+ extern void VP8FiltersInitSSE2(void);
++extern void VP8FiltersInitVSX(void);
+ 
+ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
+   WebPUnfilters[WEBP_FILTER_NONE] = NoneUnfilter_C;
+@@ -248,6 +249,11 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
+     if (VP8GetCPUInfo(kMSA)) {
+       VP8FiltersInitMSA();
+     }
++#endif
++#if defined(WEBP_HAVE_VSX)
++    if (VP8GetCPUInfo(kVSX)) {
++      VP8FiltersInitVSX();
++    }
+ #endif
+   }
+ 
+diff --git a/media/libwebp/src/dsp/filters_vsx.c b/media/libwebp/src/dsp/filters_vsx.c
+new file mode 100644
+index 000000000000..ae8e57ac685c
+--- /dev/null
++++ b/media/libwebp/src/dsp/filters_vsx.c
+@@ -0,0 +1,162 @@
++// Copyright 2015 Google Inc. All Rights Reserved.
++//
++// Use of this source code is governed by a BSD-style license
++// that can be found in the COPYING file in the root of the source
++// tree. An additional intellectual property rights grant can be found
++// in the file PATENTS. All contributing project authors may
++// be found in the AUTHORS file in the root of the source tree.
++// -----------------------------------------------------------------------------
++//
++// VSX (PowerPC) version of filtering functions.
++
++#include "src/dsp/dsp.h"
++
++#if defined(WEBP_USE_VSX)
++
++#include <altivec.h>
++#include <assert.h>
++#include <string.h>
++
++#include "src/dsp/cpu.h"
++#include "src/webp/types.h"
++
++typedef __vector unsigned char u8x16;
++typedef __vector unsigned short u16x8;
++typedef __vector signed short i16x8;
++typedef __vector unsigned long long u64x2;
++
++// Byte-wise shifts of the whole 128-bit register, matching the little-endian
++// semantics of _mm_slli_si128 / _mm_srli_si128. 'n' must be a literal.
++#define SLLI(x, n) vec_sld((x), zero, (n))
++#define SRLI(x, n) vec_sld(zero, (x), 16 - (n))
++
++// Loads 8 bytes from 'p' into the low half of a vector (high half undefined).
++static WEBP_INLINE u8x16 Load8(const uint8_t* p) {
++  uint64_t v;
++  memcpy(&v, p, 8);
++  return (u8x16)vec_splats(v);
++}
++
++//------------------------------------------------------------------------------
++// Horizontal unfilter: out[i] = in[i] + out[i - 1] (a prefix sum).
++
++static void HorizontalUnfilter_VSX(const uint8_t* prev, const uint8_t* in,
++                                   uint8_t* out, int width) {
++  const u8x16 zero = vec_splats((unsigned char)0);
++  const u64x2 sh56 = vec_splats((unsigned long long)56);
++  u8x16 last;
++  int i;
++  out[0] = (uint8_t)(in[0] + (prev == NULL ? 0 : prev[0]));
++  if (width <= 1) return;
++  last = vec_insert(out[0], zero, 0);
++  for (i = 1; i + 8 <= width; i += 8) {
++    const u8x16 A0 = Load8(in + i);
++    const u8x16 A1 = vec_add(A0, last);
++    const u8x16 A2 = SLLI(A1, 1);
++    const u8x16 A3 = vec_add(A1, A2);
++    const u8x16 A4 = SLLI(A3, 2);
++    const u8x16 A5 = vec_add(A3, A4);
++    const u8x16 A6 = SLLI(A5, 4);
++    const u8x16 A7 = vec_add(A5, A6);
++    memcpy(out + i, &A7, 8);
++    last = (u8x16)vec_sr((u64x2)A7, sh56);  // broadcast out[i + 7] to byte 0
++  }
++  for (; i < width; ++i) out[i] = (uint8_t)(in[i] + out[i - 1]);
++}
++
++//------------------------------------------------------------------------------
++// Vertical unfilter: out[i] = in[i] + prev[i].
++
++static void VerticalUnfilter_VSX(const uint8_t* prev, const uint8_t* in,
++                                 uint8_t* out, int width) {
++  if (prev == NULL) {
++    HorizontalUnfilter_VSX(NULL, in, out, width);
++  } else {
++    int i;
++    const int max_pos = width & ~31;
++    for (i = 0; i < max_pos; i += 32) {
++      const u8x16 A0 = vec_xl(0, (unsigned char*)&in[i + 0]);
++      const u8x16 A1 = vec_xl(0, (unsigned char*)&in[i + 16]);
++      const u8x16 B0 = vec_xl(0, (unsigned char*)&prev[i + 0]);
++      const u8x16 B1 = vec_xl(0, (unsigned char*)&prev[i + 16]);
++      vec_xst(vec_add(A0, B0), 0, (unsigned char*)&out[i + 0]);
++      vec_xst(vec_add(A1, B1), 0, (unsigned char*)&out[i + 16]);
++    }
++    for (; i < width; ++i) out[i] = (uint8_t)(in[i] + prev[i]);
++  }
++}
++
++//------------------------------------------------------------------------------
++// Gradient unfilter: row[i] = in[i] + clip(row[i-1] + top[i] - top[i-1]).
++
++static WEBP_INLINE int GradientPredictor_VSX(uint8_t a, uint8_t b, uint8_t c) {
++  const int g = a + b - c;
++  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;
++}
++
++static void GradientPredictInverse_VSX(const uint8_t* in, const uint8_t* top,
++                                       uint8_t* row, int length) {
++  if (length > 0) {
++    int i;
++    const int max_pos = length & ~7;
++    const u8x16 zero = vec_splats((unsigned char)0);
++    u8x16 A = vec_insert((unsigned char)row[-1], zero, 0);  // left sample
++    for (i = 0; i < max_pos; i += 8) {
++      const u8x16 t0 = Load8(top + i);
++      const u8x16 t1 = Load8(top + i - 1);
++      const u16x8 B = (u16x8)vec_mergeh(t0, zero);
++      const u16x8 C = (u16x8)vec_mergeh(t1, zero);
++      const u8x16 D = Load8(in + i);  // base input
++      const u16x8 E = vec_sub(B, C);  // unclipped gradient basis b - c
++      u8x16 out = zero;               // accumulator for output
++      u8x16 mask_hi = vec_insert((unsigned char)0xff, zero, 0);
++      int k = 8;
++      while (1) {
++        const u16x8 tmp3 = vec_add((u16x8)A, E);  // delta = a + b - c
++        const u8x16 tmp4 = vec_packsu((i16x8)tmp3, (i16x8)zero);  // sat. delta
++        const u8x16 tmp5 = vec_add(tmp4, D);                      // add to in[]
++        A = vec_and(tmp5, mask_hi);  // keep new sample
++        out = vec_or(out, A);        // accumulate output
++        if (--k == 0) break;
++        A = SLLI(A, 1);                  // rotate left sample
++        mask_hi = SLLI(mask_hi, 1);      // rotate mask
++        A = (u8x16)vec_mergeh(A, zero);  // convert 8b -> 16b
++      }
++      A = SRLI(A, 7);  // prepare left sample for next iteration
++      memcpy(row + i, &out, 8);
++    }
++    for (; i < length; ++i) {
++      const int delta = GradientPredictor_VSX(row[i - 1], top[i], top[i - 1]);
++      row[i] = (uint8_t)(in[i] + delta);
++    }
++  }
++}
++
++static void GradientUnfilter_VSX(const uint8_t* prev, const uint8_t* in,
++                                 uint8_t* out, int width) {
++  if (prev == NULL) {
++    HorizontalUnfilter_VSX(NULL, in, out, width);
++  } else {
++    out[0] = (uint8_t)(in[0] + prev[0]);  // predict from above
++    GradientPredictInverse_VSX(in + 1, prev + 1, out + 1, width - 1);
++  }
++}
++
++#undef SLLI
++#undef SRLI
++
++//------------------------------------------------------------------------------
++
++extern void VP8FiltersInitVSX(void);
++
++WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitVSX(void) {
++  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_VSX;
++  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_VSX;
++  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_VSX;
++}
++
++#else  // !WEBP_USE_VSX
++
++WEBP_DSP_INIT_STUB(VP8FiltersInitVSX)
++
++#endif  // WEBP_USE_VSX
+diff --git a/media/libwebp/src/dsp/lossless.c b/media/libwebp/src/dsp/lossless.c
+index 1a3d800c3fbc..48b5d4a3aedc 100644
+--- a/media/libwebp/src/dsp/lossless.c
++++ b/media/libwebp/src/dsp/lossless.c
+@@ -606,6 +606,7 @@ extern void VP8LDspInitAVX2(void);
+ extern void VP8LDspInitNEON(void);
+ extern void VP8LDspInitMIPSdspR2(void);
+ extern void VP8LDspInitMSA(void);
++extern void VP8LDspInitVSX(void);
+ 
+ #define COPY_PREDICTOR_ARRAY(IN, OUT) do {                \
+   (OUT)[0] = IN##0_C;                                     \
+@@ -673,6 +674,11 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
+     if (VP8GetCPUInfo(kMSA)) {
+       VP8LDspInitMSA();
+     }
++#endif
++#if defined(WEBP_HAVE_VSX)
++    if (VP8GetCPUInfo(kVSX)) {
++      VP8LDspInitVSX();
++    }
+ #endif
+   }
+ 
+diff --git a/media/libwebp/src/dsp/lossless_vsx.c b/media/libwebp/src/dsp/lossless_vsx.c
+new file mode 100644
+index 000000000000..89da30c9589c
+--- /dev/null
++++ b/media/libwebp/src/dsp/lossless_vsx.c
+@@ -0,0 +1,449 @@
++// Copyright 2014 Google Inc. All Rights Reserved.
++//
++// Use of this source code is governed by a BSD-style license
++// that can be found in the COPYING file in the root of the source
++// tree. An additional intellectual property rights grant can be found
++// in the file PATENTS. All contributing project authors may
++// be found in the AUTHORS file in the root of the source tree.
++// -----------------------------------------------------------------------------
++//
++// VSX (PowerPC) version of lossless functions.
++
++#include "src/dsp/dsp.h"
++
++#if defined(WEBP_USE_VSX)
++
++#include <altivec.h>
++#include <string.h>
++
++#include "src/dsp/cpu.h"
++#include "src/dsp/lossless.h"
++#include "src/dsp/lossless_common.h"
++#include "src/webp/format_constants.h"
++#include "src/webp/types.h"
++
++typedef __vector unsigned char u8x16;
++typedef __vector unsigned short u16x8;
++typedef __vector signed short i16x8;
++typedef __vector unsigned int u32x4;
++typedef __vector signed int i32x4;
++
++// Signed multiply-high of 16-bit lanes: (a * b) >> 16, matching
++// _mm_mulhi_epi16.
++static WEBP_INLINE i16x8 MulHiS16(i16x8 a, i16x8 b) {
++  const u32x4 sh = vec_splats((unsigned int)16);
++  const i32x4 e = vec_sra(vec_mule(a, b), sh);
++  const i32x4 o = vec_sra(vec_mulo(a, b), sh);
++  return (i16x8)vec_pack(vec_mergeh(e, o), vec_mergel(e, o));
++}
++
++//------------------------------------------------------------------------------
++// Color transforms.
++
++static void AddGreenToBlueAndRed_VSX(const uint32_t* src, int num_pixels,
++                                     uint32_t* dst) {
++  const u8x16 zero = vec_splats((unsigned char)0);
++  // Replicate the green byte (offset 1 of each pixel) into the blue/red slots.
++  const u8x16 kSpreadGreen = {1, 16, 1, 16, 5,  16, 5,  16,
++                              9, 16, 9, 16, 13, 16, 13, 16};
++  int i;
++  for (i = 0; i + 4 <= num_pixels; i += 4) {
++    const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
++    const u8x16 g = vec_perm(in, zero, kSpreadGreen);  // 0 g 0 g per pixel
++    vec_xst((u32x4)vec_add(in, g), 0, &dst[i]);
++  }
++  if (i != num_pixels) {
++    VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
++  }
++}
++
++static void TransformColorInverse_VSX(const VP8LMultipliers* const m,
++                                      const uint32_t* src, int num_pixels,
++                                      uint32_t* dst) {
++// sign-extended multiplying constants, pre-shifted by 5 (see lossless_sse2.c).
++#define CST(X) (((int16_t)((m->X) << 8)) >> 5)
++  const i16x8 mults_rb =
++      (i16x8)vec_splats((int)(((uint32_t)(uint16_t)CST(green_to_red) << 16) |
++                              ((uint16_t)CST(green_to_blue))));
++  const i16x8 mults_b2 =
++      (i16x8)vec_splats((int)((uint32_t)(uint16_t)CST(red_to_blue) << 16));
++#undef CST
++  const u8x16 zero = vec_splats((unsigned char)0);
++  const u32x4 mask_ag = vec_splats((uint32_t)0xff00ff00);  // alpha/green
++  const u16x8 sh8_16 = vec_splats((unsigned short)8);
++  const u32x4 sh8_32 = vec_splats((unsigned int)8);
++  // Broadcast the green byte (offset 1) into the high byte of both 16-bit
++  // halves of each pixel: yields g << 8 in each lane.
++  const u8x16 kGreenHi = {16, 1, 16, 1, 16, 5,  16, 5,
++                          16, 9, 16, 9, 16, 13, 16, 13};
++  int i;
++  for (i = 0; i + 4 <= num_pixels; i += 4) {
++    const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
++    const u8x16 A = (u8x16)vec_and((u32x4)in, mask_ag);   // a 0 g 0
++    const i16x8 C = (i16x8)vec_perm(A, zero, kGreenHi);   // g0g0 (g << 8)
++    const u8x16 D = (u8x16)MulHiS16(C, mults_rb);         // x dr x db1
++    const u8x16 E = vec_add(in, D);                       // x r' x b'
++    const u16x8 F = vec_sl((u16x8)E, sh8_16);             // r' 0 b' 0
++    const u8x16 G = (u8x16)MulHiS16((i16x8)F, mults_b2);  // x db2 0 0
++    const u8x16 H = (u8x16)vec_sr((u32x4)G, sh8_32);      // 0 x db2 0
++    const u16x8 I = (u16x8)vec_add(H, (u8x16)F);          // r' x b'' 0
++    const u8x16 J = (u8x16)vec_sr(I, sh8_16);             // 0 r' 0 b''
++    vec_xst(vec_or((u32x4)J, (u32x4)A), 0, &dst[i]);
++  }
++  if (i != num_pixels) {
++    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
++  }
++}
++
++//------------------------------------------------------------------------------
++// Color-space conversion functions.
++
++static void ConvertBGRAToRGBA_VSX(const uint32_t* WEBP_RESTRICT src,
++                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
++  // Swap the blue (offset 0) and red (offset 2) bytes of each pixel.
++  const u8x16 kSwapBR = {2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
++  int i;
++  for (i = 0; i + 4 <= num_pixels; i += 4) {
++    const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
++    vec_xst(vec_perm(in, in, kSwapBR), 0, &dst[4 * i]);
++  }
++  if (i != num_pixels) {
++    VP8LConvertBGRAToRGBA_C(src + i, num_pixels - i, dst + 4 * i);
++  }
++}
++
++static void ConvertBGRAToRGB_VSX(const uint32_t* WEBP_RESTRICT src,
++                                 int num_pixels, uint8_t* WEBP_RESTRICT dst) {
++  // BGRA -> RGB: gather R,G,B (offsets 2,1,0) of each pixel, drop alpha.
++  const u8x16 kToRGB = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 0, 0, 0, 0};
++  int i;
++  for (i = 0; i + 4 <= num_pixels; i += 4) {
++    const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
++    const u8x16 out = vec_perm(in, in, kToRGB);
++    memcpy(&dst[3 * i], &out, 12);
++  }
++  if (i != num_pixels) {
++    VP8LConvertBGRAToRGB_C(src + i, num_pixels - i, dst + 3 * i);
++  }
++}
++
++static void ConvertBGRAToBGR_VSX(const uint32_t* WEBP_RESTRICT src,
++                                 int num_pixels, uint8_t* WEBP_RESTRICT dst) {
++  // BGRA -> BGR: gather B,G,R (offsets 0,1,2) of each pixel, drop alpha.
++  const u8x16 kToBGR = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0, 0, 0, 0};
++  int i;
++  for (i = 0; i + 4 <= num_pixels; i += 4) {
++    const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
++    const u8x16 out = vec_perm(in, in, kToBGR);
++    memcpy(&dst[3 * i], &out, 12);
++  }
++  if (i != num_pixels) {
++    VP8LConvertBGRAToBGR_C(src + i, num_pixels - i, dst + 3 * i);
++  }
++}
++
++//------------------------------------------------------------------------------
++// Predictor transform.
++
++// Byte-wise shifts of the whole register (little-endian _mm_s{l,r}li_si128).
++#define SLLI(x, n) vec_sld((x), kZero, (n))
++#define SRLI(x, n) vec_sld(kZero, (x), 16 - (n))
++static const u8x16 kZero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
++
++// Per-byte floor average (a + b) >> 1, matching the C Average2().
++static WEBP_INLINE u8x16 Average2_u8(u8x16 a, u8x16 b) {
++  const u8x16 one = vec_splats((unsigned char)1);
++  const u8x16 avg1 = vec_avg(a, b);  // (a + b + 1) >> 1
++  return vec_sub(avg1, vec_and(vec_xor(a, b), one));
++}
++
++static WEBP_INLINE u32x4 Lane0(uint32_t v) {
++  const u32x4 r = {v, 0, 0, 0};
++  return r;
++}
++
++// Single-pixel helpers operating on the low 32-bit lane only.
++static WEBP_INLINE u16x8 Unpack16(uint32_t a) {
++  return (u16x8)vec_mergeh((u8x16)Lane0(a), kZero);
++}
++
++static WEBP_INLINE uint32_t Average2_VSX(uint32_t a0, uint32_t a1) {
++  return vec_extract((u32x4)Average2_u8((u8x16)Lane0(a0), (u8x16)Lane0(a1)), 0);
++}
++
++static WEBP_INLINE u16x8 Average2_16(uint32_t a0, uint32_t a1) {
++  const u16x8 one = vec_splats((unsigned short)1);
++  return vec_sr(vec_add(Unpack16(a0), Unpack16(a1)), one);
++}
++
++static WEBP_INLINE uint32_t Average3_VSX(uint32_t a0, uint32_t a1,
++                                         uint32_t a2) {
++  const u16x8 one = vec_splats((unsigned short)1);
++  const u16x8 avg1 = Average2_16(a0, a2);
++  const u16x8 avg2 = vec_sr(vec_add(avg1, Unpack16(a1)), one);
++  return vec_extract((u32x4)vec_packsu((i16x8)avg2, (i16x8)avg2), 0);
++}
++
++static WEBP_INLINE uint32_t Average4_VSX(uint32_t a0, uint32_t a1, uint32_t a2,
++                                         uint32_t a3) {
++  const u16x8 one = vec_splats((unsigned short)1);
++  const u16x8 avg1 = Average2_16(a0, a1);
++  const u16x8 avg2 = Average2_16(a2, a3);
++  const u16x8 avg3 = vec_sr(vec_add(avg1, avg2), one);
++  return vec_extract((u32x4)vec_packsu((i16x8)avg3, (i16x8)avg3), 0);
++}
++
++static WEBP_INLINE uint32_t ClampedAddSubtractFull_VSX(uint32_t c0, uint32_t c1,
++                                                       uint32_t c2) {
++  const i16x8 v =
++      vec_sub((i16x8)vec_add(Unpack16(c0), Unpack16(c1)), (i16x8)Unpack16(c2));
++  return vec_extract((u32x4)vec_packsu(v, v), 0);
++}
++
++static WEBP_INLINE uint32_t ClampedAddSubtractHalf_VSX(uint32_t c0, uint32_t c1,
++                                                       uint32_t c2) {
++  const u16x8 one = vec_splats((unsigned short)1);
++  const u16x8 C0 = Unpack16(c0);
++  const u16x8 C1 = Unpack16(c1);
++  const u16x8 B0 = Unpack16(c2);
++  const u16x8 A0 = vec_sr(vec_add(C1, C0), one);  // ave
++  const i16x8 A1 = vec_sub((i16x8)A0, (i16x8)B0);
++  const i16x8 BgtA = (i16x8)vec_cmpgt(B0, A0);  // 0 or -1
++  const i16x8 A2 = vec_sub(A1, BgtA);
++  const i16x8 A3 = vec_sra(A2, one);
++  const i16x8 A4 = vec_add((i16x8)A0, A3);
++  return vec_extract((u32x4)vec_packsu(A4, A4), 0);
++}
++
++static WEBP_INLINE uint32_t Select_VSX(uint32_t a, uint32_t b, uint32_t c) {
++  const u8x16 A = (u8x16)Lane0(a);
++  const u8x16 B = (u8x16)Lane0(b);
++  const u8x16 C = (u8x16)Lane0(c);
++  const u32x4 sa = vec_sum4s(vec_or(vec_subs(A, C), vec_subs(C, A)),
++                             vec_splats((unsigned int)0));
++  const u32x4 sb = vec_sum4s(vec_or(vec_subs(B, C), vec_subs(C, B)),
++                             vec_splats((unsigned int)0));
++  return vec_extract((u32x4)vec_cmpgt(sb, sa), 0) ? b : a;
++}
++
++static uint32_t Predictor5_VSX(const uint32_t* const left,
++                               const uint32_t* const top) {
++  return Average3_VSX(*left, top[0], top[1]);
++}
++static uint32_t Predictor6_VSX(const uint32_t* const left,
++                               const uint32_t* const top) {
++  return Average2_VSX(*left, top[-1]);
++}
++static uint32_t Predictor7_VSX(const uint32_t* const left,
++                               const uint32_t* const top) {
++  return Average2_VSX(*left, top[0]);
++}
++static uint32_t Predictor13_VSX(const uint32_t* const left,
++                                const uint32_t* const top) {
++  return ClampedAddSubtractHalf_VSX(*left, top[0], top[-1]);
++}
++
++static void PredictorAdd0_VSX(const uint32_t* in, const uint32_t* upper,
++                              int num_pixels, uint32_t* WEBP_RESTRICT out) {
++  const u8x16 black = (u8x16)vec_splats((uint32_t)ARGB_BLACK);
++  int i;
++  for (i = 0; i + 4 <= num_pixels; i += 4) {
++    const u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
++    vec_xst((u32x4)vec_add(src, black), 0, &out[i]);
++  }
++  if (i != num_pixels) {
++    VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);
++  }
++  (void)upper;
++}
++
++static void PredictorAdd1_VSX(const uint32_t* in, const uint32_t* upper,
++                              int num_pixels, uint32_t* WEBP_RESTRICT out) {
++  u32x4 prev = vec_splats(out[-1]);
++  int i;
++  for (i = 0; i + 4 <= num_pixels; i += 4) {
++    const u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
++    const u8x16 sum0 = vec_add(src, SLLI(src, 4));    // a | a+b | b+c | c+d
++    const u8x16 sum1 = vec_add(sum0, SLLI(sum0, 8));  // running sum
++    const u8x16 res = vec_add(sum1, (u8x16)prev);
++    vec_xst((u32x4)res, 0, &out[i]);
++    prev = vec_splat((u32x4)res, 3);  // replicate last pixel
++  }
++  if (i != num_pixels) {
++    VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
++  }
++}
++
++#define GENERATE_PREDICTOR_1_VSX(X, IN)                                        \
++  static void PredictorAdd##X##_VSX(const uint32_t* in, const uint32_t* upper, \
++                                    int num_pixels,                            \
++                                    uint32_t* WEBP_RESTRICT out) {             \
++    int i;                                                                     \
++    for (i = 0; i + 4 <= num_pixels; i += 4) {                                 \
++      const u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);                   \
++      const u8x16 other = (u8x16)vec_xl(0, (uint32_t*)&(IN));                  \
++      vec_xst((u32x4)vec_add(src, other), 0, &out[i]);                         \
++    }                                                                          \
++    if (i != num_pixels) {                                                     \
++      VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);    \
++    }                                                                          \
++  }
++GENERATE_PREDICTOR_1_VSX(2, upper[i])      // Top.
++GENERATE_PREDICTOR_1_VSX(3, upper[i + 1])  // Top-right.
++GENERATE_PREDICTOR_1_VSX(4, upper[i - 1])  // Top-left.
++#undef GENERATE_PREDICTOR_1_VSX
++
++// Predictors 5, 6, 7, 13 use integer averages and cannot be accumulated in
++// parallel, so use the generic one-pixel-at-a-time batch.
++GENERATE_PREDICTOR_ADD(Predictor5_VSX, PredictorAdd5_VSX)
++GENERATE_PREDICTOR_ADD(Predictor6_VSX, PredictorAdd6_VSX)
++GENERATE_PREDICTOR_ADD(Predictor7_VSX, PredictorAdd7_VSX)
++GENERATE_PREDICTOR_ADD(Predictor13_VSX, PredictorAdd13_VSX)
++
++#define GENERATE_PREDICTOR_2_VSX(X, IN)                                        \
++  static void PredictorAdd##X##_VSX(const uint32_t* in, const uint32_t* upper, \
++                                    int num_pixels,                            \
++                                    uint32_t* WEBP_RESTRICT out) {             \
++    int i;                                                                     \
++    for (i = 0; i + 4 <= num_pixels; i += 4) {                                 \
++      const u8x16 Tother = (u8x16)vec_xl(0, (uint32_t*)&(IN));                 \
++      const u8x16 T = (u8x16)vec_xl(0, (uint32_t*)&upper[i]);                  \
++      const u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);                   \
++      vec_xst((u32x4)vec_add(Average2_u8(T, Tother), src), 0, &out[i]);        \
++    }                                                                          \
++    if (i != num_pixels) {                                                     \
++      VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);    \
++    }                                                                          \
++  }
++GENERATE_PREDICTOR_2_VSX(8, upper[i - 1])  // Average TL, T.
++GENERATE_PREDICTOR_2_VSX(9, upper[i + 1])  // Average T, TR.
++#undef GENERATE_PREDICTOR_2_VSX
++
++// Predictor10: average of (average(L, TL), average(T, TR)).
++static void PredictorAdd10_VSX(const uint32_t* in, const uint32_t* upper,
++                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
++  u8x16 L = (u8x16)Lane0(out[-1]);
++  int i, k;
++  for (i = 0; i + 4 <= num_pixels; i += 4) {
++    u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
++    u8x16 TL = (u8x16)vec_xl(0, (uint32_t*)&upper[i - 1]);
++    const u8x16 T = (u8x16)vec_xl(0, (uint32_t*)&upper[i]);
++    const u8x16 TR = (u8x16)vec_xl(0, (uint32_t*)&upper[i + 1]);
++    u8x16 avgTTR = Average2_u8(T, TR);
++    for (k = 0; k < 4; ++k) {
++      const u8x16 avg = Average2_u8(avgTTR, Average2_u8(L, TL));
++      L = vec_add(avg, src);
++      out[i + k] = vec_extract((u32x4)L, 0);
++      avgTTR = SRLI(avgTTR, 4);
++      TL = SRLI(TL, 4);
++      src = SRLI(src, 4);
++    }
++  }
++  if (i != num_pixels) {
++    VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
++  }
++}
++
++// Predictor11: select between T and L based on |T-TL| vs |L-TL|.
++static void PredictorAdd11_VSX(const uint32_t* in, const uint32_t* upper,
++                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
++  const u32x4 z32 = vec_splats((unsigned int)0);
++  u8x16 L = (u8x16)Lane0(out[-1]);
++  int i, k;
++  for (i = 0; i + 4 <= num_pixels; i += 4) {
++    u8x16 T = (u8x16)vec_xl(0, (uint32_t*)&upper[i]);
++    u8x16 TL = (u8x16)vec_xl(0, (uint32_t*)&upper[i - 1]);
++    u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
++    u8x16 pa = (u8x16)vec_sum4s(vec_or(vec_subs(T, TL), vec_subs(TL, T)), z32);
++    for (k = 0; k < 4; ++k) {
++      const u32x4 pb = vec_sum4s(vec_or(vec_subs(L, TL), vec_subs(TL, L)), z32);
++      const u32x4 mask = (u32x4)vec_cmpgt(pb, (u32x4)pa);  // pb > pa ? L : T
++      const u8x16 pred = vec_sel(T, L, (u8x16)mask);
++      L = vec_add(src, pred);
++      out[i + k] = vec_extract((u32x4)L, 0);
++      T = SRLI(T, 4);
++      TL = SRLI(TL, 4);
++      src = SRLI(src, 4);
++      pa = SRLI(pa, 4);
++    }
++  }
++  if (i != num_pixels) {
++    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
++  }
++}
++
++// Predictor12: ClampedAddSubtractFull. 'L' is kept unpacked to 16 bits in the
++// low 4 lanes; 'diff' (= T - TL) holds two pixels, the active one in lanes 0-3.
++#define DO_PRED12(DIFF)                                   \
++  do {                                                    \
++    const i16x8 all = vec_add((i16x8)L, (DIFF));          \
++    const u8x16 res = vec_add(src, vec_packsu(all, all)); \
++    out[i + out_idx++] = vec_extract((u32x4)res, 0);      \
++    L = (u16x8)vec_mergeh(res, kZero);                    \
++  } while (0)
++
++static void PredictorAdd12_VSX(const uint32_t* in, const uint32_t* upper,
++                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
++  u16x8 L = Unpack16(out[-1]);
++  int i;
++  for (i = 0; i + 4 <= num_pixels; i += 4) {
++    int out_idx = 0;
++    u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
++    const u8x16 T = (u8x16)vec_xl(0, (uint32_t*)&upper[i]);
++    const u8x16 TL = (u8x16)vec_xl(0, (uint32_t*)&upper[i - 1]);
++    // 16-bit gradient basis T - TL for the four pixels (low and high halves).
++    i16x8 diff_lo =
++        vec_sub((i16x8)vec_mergeh(T, kZero), (i16x8)vec_mergeh(TL, kZero));
++    i16x8 diff_hi =
++        vec_sub((i16x8)vec_mergel(T, kZero), (i16x8)vec_mergel(TL, kZero));
++    DO_PRED12(diff_lo);
++    diff_lo = (i16x8)SRLI((u8x16)diff_lo, 8);
++    src = SRLI(src, 4);
++    DO_PRED12(diff_lo);
++    src = SRLI(src, 4);
++    DO_PRED12(diff_hi);
++    diff_hi = (i16x8)SRLI((u8x16)diff_hi, 8);
++    src = SRLI(src, 4);
++    DO_PRED12(diff_hi);
++  }
++  if (i != num_pixels) {
++    VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
++  }
++}
++#undef DO_PRED12
++
++#undef SLLI
++#undef SRLI
++
++//------------------------------------------------------------------------------
++
++extern void VP8LDspInitVSX(void);
++
++WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitVSX(void) {
++  VP8LPredictorsAdd[0] = PredictorAdd0_VSX;
++  VP8LPredictorsAdd[1] = PredictorAdd1_VSX;
++  VP8LPredictorsAdd[2] = PredictorAdd2_VSX;
++  VP8LPredictorsAdd[3] = PredictorAdd3_VSX;
++  VP8LPredictorsAdd[4] = PredictorAdd4_VSX;
++  VP8LPredictorsAdd[5] = PredictorAdd5_VSX;
++  VP8LPredictorsAdd[6] = PredictorAdd6_VSX;
++  VP8LPredictorsAdd[7] = PredictorAdd7_VSX;
++  VP8LPredictorsAdd[8] = PredictorAdd8_VSX;
++  VP8LPredictorsAdd[9] = PredictorAdd9_VSX;
++  VP8LPredictorsAdd[10] = PredictorAdd10_VSX;
++  VP8LPredictorsAdd[11] = PredictorAdd11_VSX;
++  VP8LPredictorsAdd[12] = PredictorAdd12_VSX;
++  VP8LPredictorsAdd[13] = PredictorAdd13_VSX;
++
++  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_VSX;
++  VP8LTransformColorInverse = TransformColorInverse_VSX;
++  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_VSX;
++  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_VSX;
++  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_VSX;
++}
++
++#else  // !WEBP_USE_VSX
++
++WEBP_DSP_INIT_STUB(VP8LDspInitVSX)
++
++#endif  // WEBP_USE_VSX
+diff --git a/media/libwebp/src/dsp/moz.build b/media/libwebp/src/dsp/moz.build
+index 8d6f8427c900..f3e9d1273110 100644
+--- a/media/libwebp/src/dsp/moz.build
++++ b/media/libwebp/src/dsp/moz.build
+@@ -118,6 +118,20 @@ elif CONFIG['TARGET_CPU'].startswith('mips'):
+         'yuv_mips32.c',
+         'yuv_mips_dsp_r2.c',
+     ]
++elif CONFIG['TARGET_CPU'] == 'ppc64':
++    SOURCES += [
++        'alpha_processing_vsx.c',
++        'dec_vsx.c',
++        'filters_vsx.c',
++        'lossless_vsx.c',
++        'rescaler_vsx.c',
++        'upsampling_vsx.c',
++        'yuv_vsx.c',
++    ]
++    DEFINES['WEBP_HAVE_VSX'] = 1;
++    for f in SOURCES:
++      if f.endswith('vsx.c'):
++        SOURCES[f].flags += ['-mvsx']
+ 
+ if CONFIG['CC_TYPE'] in ('clang', 'clang-cl'):
+     CFLAGS += ['-Wno-unreachable-code']
+diff --git a/media/libwebp/src/dsp/rescaler.c b/media/libwebp/src/dsp/rescaler.c
+index eafccd442f25..2c0c8c47a7a3 100644
+--- a/media/libwebp/src/dsp/rescaler.c
++++ b/media/libwebp/src/dsp/rescaler.c
+@@ -207,6 +207,7 @@ extern void WebPRescalerDspInitMIPS32(void);
+ extern void WebPRescalerDspInitMIPSdspR2(void);
+ extern void WebPRescalerDspInitMSA(void);
+ extern void WebPRescalerDspInitNEON(void);
++extern void WebPRescalerDspInitVSX(void);
+ 
+ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
+ #if !defined(WEBP_REDUCE_SIZE)
+@@ -238,6 +239,11 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
+     if (VP8GetCPUInfo(kMSA)) {
+       WebPRescalerDspInitMSA();
+     }
++#endif
++#if defined(WEBP_HAVE_VSX)
++    if (VP8GetCPUInfo(kVSX)) {
++      WebPRescalerDspInitVSX();
++    }
+ #endif
+   }
+ 
+diff --git a/media/libwebp/src/dsp/rescaler_vsx.c b/media/libwebp/src/dsp/rescaler_vsx.c
+new file mode 100644
+index 000000000000..002f232d647a
+--- /dev/null
++++ b/media/libwebp/src/dsp/rescaler_vsx.c
+@@ -0,0 +1,201 @@
++// Copyright 2015 Google Inc. All Rights Reserved.
++//
++// Use of this source code is governed by a BSD-style license
++// that can be found in the COPYING file in the root of the source
++// tree. An additional intellectual property rights grant can be found
++// in the file PATENTS. All contributing project authors may
++// be found in the AUTHORS file in the root of the source tree.
++// -----------------------------------------------------------------------------
++//
++// VSX (PowerPC) version of rescaling functions.
++
++#include "src/dsp/dsp.h"
++
++#if defined(WEBP_USE_VSX) && !defined(WEBP_REDUCE_SIZE)
++
++#include <altivec.h>
++#include <assert.h>
++#include <string.h>
++
++#include "src/dsp/cpu.h"
++#include "src/utils/rescaler_utils.h"
++#include "src/webp/types.h"
++
++typedef __vector unsigned char u8x16;
++typedef __vector signed short i16x8;
++typedef __vector unsigned int u32x4;
++typedef __vector signed int i32x4;
++typedef __vector unsigned long long u64x2;
++
++#define ROUNDER (WEBP_RESCALER_ONE >> 1)
++#define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
++#define MULT_FIX_FLOOR_C(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
++
++#if (WEBP_RESCALER_RFIX != 32)
++#error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
++#endif
++
++// Returns (x * scale + ROUNDER) >> 32 for each of the four 32-bit lanes.
++static WEBP_INLINE u32x4 MultFix_VSX(u32x4 x, uint32_t scale) {
++  const u64x2 rounder = vec_splats((unsigned long long)ROUNDER);
++  const u64x2 shift = vec_splats((unsigned long long)WEBP_RESCALER_RFIX);
++  const u32x4 s = vec_splats(scale);
++  // vec_mule/vec_mulo produce the 32x32->64 products of the even (0, 2) and
++  // odd (1, 3) lanes respectively.
++  u64x2 e = vec_add(vec_mule(x, s), rounder);
++  u64x2 o = vec_add(vec_mulo(x, s), rounder);
++  e = vec_sr(e, shift);
++  o = vec_sr(o, shift);
++  return vec_mergee((u32x4)e, (u32x4)o);
++}
++
++// Returns (x * scale) >> 32 for each lane (no rounding).
++static WEBP_INLINE u32x4 MultFixFloor_VSX(u32x4 x, uint32_t scale) {
++  const u64x2 shift = vec_splats((unsigned long long)WEBP_RESCALER_RFIX);
++  const u32x4 s = vec_splats(scale);
++  u64x2 e = vec_sr(vec_mule(x, s), shift);
++  u64x2 o = vec_sr(vec_mulo(x, s), shift);
++  return vec_mergee((u32x4)e, (u32x4)o);
++}
++
++// Returns (A * frow + B * irow + ROUNDER) >> 32 for each lane.
++static WEBP_INLINE u32x4 Interpolate_VSX(const rescaler_t* WEBP_RESTRICT frow,
++                                         const rescaler_t* WEBP_RESTRICT irow,
++                                         uint32_t A, uint32_t B) {
++  const u64x2 rounder = vec_splats((unsigned long long)ROUNDER);
++  const u64x2 shift = vec_splats((unsigned long long)WEBP_RESCALER_RFIX);
++  const u32x4 f = vec_xl(0, (uint32_t*)frow);
++  const u32x4 ir = vec_xl(0, (uint32_t*)irow);
++  const u32x4 va = vec_splats(A);
++  const u32x4 vb = vec_splats(B);
++  u64x2 e = vec_add(vec_mule(f, va), vec_mule(ir, vb));
++  u64x2 o = vec_add(vec_mulo(f, va), vec_mulo(ir, vb));
++  e = vec_sr(vec_add(e, rounder), shift);
++  o = vec_sr(vec_add(o, rounder), shift);
++  return vec_mergee((u32x4)e, (u32x4)o);
++}
++
++// Saturated pack of two 32-bit lane vectors (8 values) into 8 bytes at dst.
++static WEBP_INLINE void Store8_VSX(u32x4 lo, u32x4 hi, uint8_t* dst) {
++  const i16x8 s16 = vec_packs((i32x4)lo, (i32x4)hi);
++  const u8x16 s8 = vec_packsu(s16, s16);
++  memcpy(dst, &s8, 8);
++}
++
++static void RescalerExportRowExpand_VSX(WebPRescaler* const wrk) {
++  int x_out;
++  uint8_t* const dst = wrk->dst;
++  rescaler_t* const irow = wrk->irow;
++  const int x_out_max = wrk->dst_width * wrk->num_channels;
++  const int max_span = x_out_max & ~7;
++  const rescaler_t* const frow = wrk->frow;
++  const uint32_t fy_scale = wrk->fy_scale;
++  assert(!WebPRescalerOutputDone(wrk));
++  assert(wrk->y_accum <= 0);
++  assert(wrk->y_expand);
++  assert(wrk->y_sub != 0);
++  if (wrk->y_accum == 0) {
++    for (x_out = 0; x_out < max_span; x_out += 8) {
++      const u32x4 A0 = vec_xl(0, (uint32_t*)(frow + x_out + 0));
++      const u32x4 A1 = vec_xl(0, (uint32_t*)(frow + x_out + 4));
++      const u32x4 B0 = MultFix_VSX(A0, fy_scale);
++      const u32x4 B1 = MultFix_VSX(A1, fy_scale);
++      Store8_VSX(B0, B1, dst + x_out);
++    }
++    for (; x_out < x_out_max; ++x_out) {
++      const uint32_t J = frow[x_out];
++      const int v = (int)MULT_FIX_C(J, fy_scale);
++      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
++    }
++  } else {
++    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
++    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
++    for (x_out = 0; x_out < max_span; x_out += 8) {
++      const u32x4 C0 =
++          Interpolate_VSX(frow + x_out + 0, irow + x_out + 0, A, B);
++      const u32x4 C1 =
++          Interpolate_VSX(frow + x_out + 4, irow + x_out + 4, A, B);
++      const u32x4 D0 = MultFix_VSX(C0, fy_scale);
++      const u32x4 D1 = MultFix_VSX(C1, fy_scale);
++      Store8_VSX(D0, D1, dst + x_out);
++    }
++    for (; x_out < x_out_max; ++x_out) {
++      const uint64_t I = (uint64_t)A * frow[x_out] + (uint64_t)B * irow[x_out];
++      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
++      const int v = (int)MULT_FIX_C(J, fy_scale);
++      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
++    }
++  }
++}
++
++static void RescalerExportRowShrink_VSX(WebPRescaler* const wrk) {
++  int x_out;
++  uint8_t* const dst = wrk->dst;
++  rescaler_t* const irow = wrk->irow;
++  const int x_out_max = wrk->dst_width * wrk->num_channels;
++  const int max_span = x_out_max & ~7;
++  const rescaler_t* const frow = wrk->frow;
++  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
++  const uint32_t fxy_scale = wrk->fxy_scale;
++  assert(!WebPRescalerOutputDone(wrk));
++  assert(wrk->y_accum <= 0);
++  assert(!wrk->y_expand);
++  if (yscale) {
++    for (x_out = 0; x_out < max_span; x_out += 8) {
++      const u32x4 in0 = vec_xl(0, (uint32_t*)(frow + x_out + 0));
++      const u32x4 in1 = vec_xl(0, (uint32_t*)(frow + x_out + 4));
++      const u32x4 in2 = vec_xl(0, (uint32_t*)(irow + x_out + 0));
++      const u32x4 in3 = vec_xl(0, (uint32_t*)(irow + x_out + 4));
++      const u32x4 A0 = MultFixFloor_VSX(in0, yscale);
++      const u32x4 A1 = MultFixFloor_VSX(in1, yscale);
++      const u32x4 B0 = vec_sub(in2, A0);
++      const u32x4 B1 = vec_sub(in3, A1);
++      const u32x4 C0 = MultFix_VSX(B0, fxy_scale);
++      const u32x4 C1 = MultFix_VSX(B1, fxy_scale);
++      Store8_VSX(C0, C1, dst + x_out);
++      vec_xst(A0, 0, (uint32_t*)(irow + x_out + 0));
++      vec_xst(A1, 0, (uint32_t*)(irow + x_out + 4));
++    }
++    for (; x_out < x_out_max; ++x_out) {
++      const uint32_t frac = (uint32_t)MULT_FIX_FLOOR_C(frow[x_out], yscale);
++      const int v = (int)MULT_FIX_C(irow[x_out] - frac, fxy_scale);
++      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
++      irow[x_out] = frac;  // new fractional start
++    }
++  } else {
++    const u32x4 zero = vec_splats((uint32_t)0);
++    for (x_out = 0; x_out < max_span; x_out += 8) {
++      const u32x4 in0 = vec_xl(0, (uint32_t*)(irow + x_out + 0));
++      const u32x4 in1 = vec_xl(0, (uint32_t*)(irow + x_out + 4));
++      const u32x4 A0 = MultFix_VSX(in0, fxy_scale);
++      const u32x4 A1 = MultFix_VSX(in1, fxy_scale);
++      Store8_VSX(A0, A1, dst + x_out);
++      vec_xst(zero, 0, (uint32_t*)(irow + x_out + 0));
++      vec_xst(zero, 0, (uint32_t*)(irow + x_out + 4));
++    }
++    for (; x_out < x_out_max; ++x_out) {
++      const int v = (int)MULT_FIX_C(irow[x_out], fxy_scale);
++      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
++      irow[x_out] = 0;
++    }
++  }
++}
++
++#undef MULT_FIX_FLOOR_C
++#undef MULT_FIX_C
++#undef ROUNDER
++
++//------------------------------------------------------------------------------
++
++extern void WebPRescalerDspInitVSX(void);
++
++WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitVSX(void) {
++  WebPRescalerExportRowExpand = RescalerExportRowExpand_VSX;
++  WebPRescalerExportRowShrink = RescalerExportRowShrink_VSX;
++}
++
++#else  // !WEBP_USE_VSX
++
++WEBP_DSP_INIT_STUB(WebPRescalerDspInitVSX)
++
++#endif  // WEBP_USE_VSX
+diff --git a/media/libwebp/src/dsp/upsampling.c b/media/libwebp/src/dsp/upsampling.c
+index c57f66c3553f..faecdf277393 100644
+--- a/media/libwebp/src/dsp/upsampling.c
++++ b/media/libwebp/src/dsp/upsampling.c
+@@ -235,6 +235,7 @@ extern VP8CPUInfo VP8GetCPUInfo;
+ extern void WebPInitYUV444ConvertersMIPSdspR2(void);
+ extern void WebPInitYUV444ConvertersSSE2(void);
+ extern void WebPInitYUV444ConvertersSSE41(void);
++extern void WebPInitYUV444ConvertersVSX(void);
+ 
+ WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
+   WebPYUV444Converters[MODE_RGBA]      = WebPYuv444ToRgba_C;
+@@ -264,6 +265,11 @@ WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
+     if (VP8GetCPUInfo(kMIPSdspR2)) {
+       WebPInitYUV444ConvertersMIPSdspR2();
+     }
++#endif
++#if defined(WEBP_HAVE_VSX)
++    if (VP8GetCPUInfo(kVSX)) {
++      WebPInitYUV444ConvertersVSX();
++    }
+ #endif
+   }
+ }
+@@ -276,6 +282,7 @@ extern void WebPInitUpsamplersSSE41(void);
+ extern void WebPInitUpsamplersNEON(void);
+ extern void WebPInitUpsamplersMIPSdspR2(void);
+ extern void WebPInitUpsamplersMSA(void);
++extern void WebPInitUpsamplersVSX(void);
+ 
+ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
+ #ifdef FANCY_UPSAMPLING
+@@ -314,6 +321,11 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
+     if (VP8GetCPUInfo(kMSA)) {
+       WebPInitUpsamplersMSA();
+     }
++#endif
++#if defined(WEBP_HAVE_VSX)
++    if (VP8GetCPUInfo(kVSX)) {
++      WebPInitUpsamplersVSX();
++    }
+ #endif
+   }
+ 
+diff --git a/media/libwebp/src/dsp/upsampling_vsx.c b/media/libwebp/src/dsp/upsampling_vsx.c
+new file mode 100644
+index 000000000000..a7191972fc6e
+--- /dev/null
++++ b/media/libwebp/src/dsp/upsampling_vsx.c
+@@ -0,0 +1,151 @@
++// Copyright 2011 Google Inc. All Rights Reserved.
++//
++// Use of this source code is governed by a BSD-style license
++// that can be found in the COPYING file in the root of the source
++// tree. An additional intellectual property rights grant can be found
++// in the file PATENTS. All contributing project authors may
++// be found in the AUTHORS file in the root of the source tree.
++// -----------------------------------------------------------------------------
++//
++// VSX (PowerPC) version of YUV to RGB upsampling functions.
++
++#include "src/dsp/dsp.h"
++
++#if defined(WEBP_USE_VSX)
++
++#include <altivec.h>
++#include <assert.h>
++#include <string.h>
++
++#include "src/dsp/cpu.h"
++#include "src/dsp/yuv.h"
++#include "src/webp/decode.h"
++#include "src/webp/types.h"
++
++typedef __vector unsigned char u8x16;
++typedef __vector unsigned short u16x8;
++
++// Upsample 16 chroma pairs from rows r1/r2 (17 readable bytes each) into 32
++// "top" bytes at out[0..31] and 32 "bottom" bytes at out[64..95], matching the
++// fancy-upsampler diagonal weights (a + 3b + 3c + d) / 8 etc.
++#define GET_M(ij, in) \
++  vec_sub(vec_avg(k, (in)), \
++          vec_and(vec_or(vec_and((ij), st), vec_xor(k, (in))), one))
++
++static void Upsample32Pixels(const uint8_t* WEBP_RESTRICT r1,
++                             const uint8_t* WEBP_RESTRICT r2,
++                             uint8_t* WEBP_RESTRICT out) {
++  const u8x16 one = vec_splats((unsigned char)1);
++  const u8x16 a = vec_xl(0, (const unsigned char*)r1);
++  const u8x16 b = vec_xl(1, (const unsigned char*)r1);
++  const u8x16 c = vec_xl(0, (const unsigned char*)r2);
++  const u8x16 d = vec_xl(1, (const unsigned char*)r2);
++  const u8x16 s = vec_avg(a, d);
++  const u8x16 t = vec_avg(b, c);
++  const u8x16 st = vec_xor(s, t);
++  const u8x16 t3 =
++      vec_and(vec_or(vec_or(vec_xor(a, d), vec_xor(b, c)), st), one);
++  const u8x16 k = vec_sub(vec_avg(s, t), t3);
++  const u8x16 diag1 = GET_M(vec_xor(b, c), t);
++  const u8x16 diag2 = GET_M(vec_xor(a, d), s);
++  const u8x16 ta = vec_avg(a, diag1), tb = vec_avg(b, diag2);
++  const u8x16 tc = vec_avg(c, diag2), td = vec_avg(d, diag1);
++  vec_xst(vec_mergeh(ta, tb), 0, out);
++  vec_xst(vec_mergel(ta, tb), 0, out + 16);
++  vec_xst(vec_mergeh(tc, td), 0, out + 64);
++  vec_xst(vec_mergel(tc, td), 0, out + 80);
++}
++
++#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, FUNC32)                                 \
++static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                      \
++                      const uint8_t* WEBP_RESTRICT bottom_y,                   \
++                      const uint8_t* WEBP_RESTRICT top_u,                      \
++                      const uint8_t* WEBP_RESTRICT top_v,                      \
++                      const uint8_t* WEBP_RESTRICT cur_u,                      \
++                      const uint8_t* WEBP_RESTRICT cur_v,                      \
++                      uint8_t* WEBP_RESTRICT top_dst,                          \
++                      uint8_t* WEBP_RESTRICT bottom_dst, int len) {            \
++  int uv_pos, pos;                                                            \
++  uint8_t uv_buf[14 * 32 + 15] = {0};                                         \
++  uint8_t* const r_u = (uint8_t*)(((uintptr_t)(uv_buf + 15)) & ~(uintptr_t)15);\
++  uint8_t* const r_v = r_u + 32;                                              \
++  assert(top_y != NULL);                                                      \
++  {                                                                           \
++    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                      \
++    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                      \
++    FUNC(top_y[0], (top_u[0] + u_diag) >> 1, (top_v[0] + v_diag) >> 1,        \
++         top_dst);                                                           \
++    if (bottom_y != NULL) {                                                   \
++      FUNC(bottom_y[0], (cur_u[0] + u_diag) >> 1, (cur_v[0] + v_diag) >> 1,   \
++           bottom_dst);                                                      \
++    }                                                                         \
++  }                                                                           \
++  for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) {   \
++    Upsample32Pixels(top_u + uv_pos, cur_u + uv_pos, r_u);                    \
++    Upsample32Pixels(top_v + uv_pos, cur_v + uv_pos, r_v);                    \
++    FUNC32(top_y + pos, r_u, r_v, top_dst + pos * 4);                         \
++    if (bottom_y != NULL) {                                                   \
++      FUNC32(bottom_y + pos, r_u + 64, r_v + 64, bottom_dst + pos * 4);       \
++    }                                                                         \
++  }                                                                           \
++  if (len > 1) {                                                              \
++    const int left_over = ((len + 1) >> 1) - (pos >> 1);                      \
++    uint8_t* const tmp_top_dst = r_u + 4 * 32;                                \
++    uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32;                     \
++    uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32;                         \
++    uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32;     \
++    uint8_t r1[17], r2[17];                                                   \
++    assert(left_over > 0);                                                    \
++    memcpy(r1, top_u + uv_pos, left_over);                                    \
++    memcpy(r2, cur_u + uv_pos, left_over);                                    \
++    memset(r1 + left_over, r1[left_over - 1], 17 - left_over);                \
++    memset(r2 + left_over, r2[left_over - 1], 17 - left_over);                \
++    Upsample32Pixels(r1, r2, r_u);                                            \
++    memcpy(r1, top_v + uv_pos, left_over);                                    \
++    memcpy(r2, cur_v + uv_pos, left_over);                                    \
++    memset(r1 + left_over, r1[left_over - 1], 17 - left_over);                \
++    memset(r2 + left_over, r2[left_over - 1], 17 - left_over);                \
++    Upsample32Pixels(r1, r2, r_v);                                            \
++    memcpy(tmp_top, top_y + pos, len - pos);                                  \
++    if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos);      \
++    FUNC32(tmp_top, r_u, r_v, tmp_top_dst);                                   \
++    if (bottom_y != NULL) FUNC32(tmp_bottom, r_u + 64, r_v + 64,             \
++                                 tmp_bottom_dst);                            \
++    memcpy(top_dst + pos * 4, tmp_top_dst, (len - pos) * 4);                  \
++    if (bottom_y != NULL) {                                                   \
++      memcpy(bottom_dst + pos * 4, tmp_bottom_dst, (len - pos) * 4);          \
++    }                                                                         \
++  }                                                                           \
++}
++
++UPSAMPLE_FUNC(UpsampleRgbaLinePair_VSX, VP8YuvToRgba, VP8YuvToRgba32_VSX)
++UPSAMPLE_FUNC(UpsampleBgraLinePair_VSX, VP8YuvToBgra, VP8YuvToBgra32_VSX)
++UPSAMPLE_FUNC(UpsampleArgbLinePair_VSX, VP8YuvToArgb, VP8YuvToArgb32_VSX)
++
++extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
++
++extern void WebPInitUpsamplersVSX(void);
++
++WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersVSX(void) {
++  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_VSX;
++  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_VSX;
++  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_VSX;
++  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_VSX;
++#if !defined(WEBP_REDUCE_CSP)
++  WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_VSX;
++  WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_VSX;
++#endif
++}
++
++extern void WebPInitYUV444ConvertersVSX(void);
++
++// YUV444 point converters stay on the C path for now.
++WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersVSX(void) {}
++
++#else  // !WEBP_USE_VSX
++
++WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersVSX)
++
++WEBP_DSP_INIT_STUB(WebPInitUpsamplersVSX)
++
++#endif  // WEBP_USE_VSX
+diff --git a/media/libwebp/src/dsp/yuv.c b/media/libwebp/src/dsp/yuv.c
+index 62f1ecc1567d..9a95c5de1e23 100644
+--- a/media/libwebp/src/dsp/yuv.c
++++ b/media/libwebp/src/dsp/yuv.c
+@@ -81,6 +81,7 @@ extern void WebPInitSamplersSSE2(void);
+ extern void WebPInitSamplersSSE41(void);
+ extern void WebPInitSamplersMIPS32(void);
+ extern void WebPInitSamplersMIPSdspR2(void);
++extern void WebPInitSamplersVSX(void);
+ 
+ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
+   WebPSamplers[MODE_RGB]       = YuvToRgbRow;
+@@ -117,6 +118,11 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
+       WebPInitSamplersMIPSdspR2();
+     }
+ #endif  // WEBP_USE_MIPS_DSP_R2
++#if defined(WEBP_HAVE_VSX)
++    if (VP8GetCPUInfo(kVSX)) {
++      WebPInitSamplersVSX();
++    }
++#endif
+   }
+ }
+ 
+diff --git a/media/libwebp/src/dsp/yuv.h b/media/libwebp/src/dsp/yuv.h
+index 6f218cf7e07f..979891d3232d 100644
+--- a/media/libwebp/src/dsp/yuv.h
++++ b/media/libwebp/src/dsp/yuv.h
+@@ -182,6 +182,27 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
+ 
+ #endif    // WEBP_USE_SSE2
+ 
++//-----------------------------------------------------------------------------
++// VSX extra functions (mostly for upsampling_vsx.c)
++
++#if defined(WEBP_USE_VSX)
++
++// Process 32 pixels and store the 32b-per-pixel result in *dst.
++void VP8YuvToRgba32_VSX(const uint8_t* WEBP_RESTRICT y,
++                        const uint8_t* WEBP_RESTRICT u,
++                        const uint8_t* WEBP_RESTRICT v,
++                        uint8_t* WEBP_RESTRICT dst);
++void VP8YuvToBgra32_VSX(const uint8_t* WEBP_RESTRICT y,
++                        const uint8_t* WEBP_RESTRICT u,
++                        const uint8_t* WEBP_RESTRICT v,
++                        uint8_t* WEBP_RESTRICT dst);
++void VP8YuvToArgb32_VSX(const uint8_t* WEBP_RESTRICT y,
++                        const uint8_t* WEBP_RESTRICT u,
++                        const uint8_t* WEBP_RESTRICT v,
++                        uint8_t* WEBP_RESTRICT dst);
++
++#endif    // WEBP_USE_VSX
++
+ //-----------------------------------------------------------------------------
+ // SSE41 extra functions (mostly for upsampling_sse41.c)
+ 
+diff --git a/media/libwebp/src/dsp/yuv_vsx.c b/media/libwebp/src/dsp/yuv_vsx.c
+new file mode 100644
+index 000000000000..1fdc5c80ba16
+--- /dev/null
++++ b/media/libwebp/src/dsp/yuv_vsx.c
+@@ -0,0 +1,206 @@
++// Copyright 2014 Google Inc. All Rights Reserved.
++//
++// Use of this source code is governed by a BSD-style license
++// that can be found in the COPYING file in the root of the source
++// tree. An additional intellectual property rights grant can be found
++// in the file PATENTS. All contributing project authors may
++// be found in the AUTHORS file in the root of the source tree.
++// -----------------------------------------------------------------------------
++//
++// VSX (PowerPC) version of YUV->RGB conversion functions.
++
++#include "src/dsp/dsp.h"
++
++#if defined(WEBP_USE_VSX)
++
++#include <altivec.h>
++#include <string.h>
++
++#include "src/dsp/yuv.h"
++
++typedef __vector unsigned char  u8x16;
++typedef __vector unsigned short u16x8;
++typedef __vector signed   short i16x8;
++typedef __vector unsigned int   u32x4;
++
++// POWER8 has no "multiply-high unsigned halfword", so emulate _mm_mulhi_epu16
++// via even/odd 16x16->32 products, >>16, then interleave back.
++static WEBP_INLINE u16x8 MulHi16(u16x8 a, u16x8 b) {
++  const u32x4 sh = vec_splats((unsigned int)16);
++  const u32x4 e = vec_sr(vec_mule(a, b), sh);
++  const u32x4 o = vec_sr(vec_mulo(a, b), sh);
++  return vec_pack(vec_mergeh(e, o), vec_mergel(e, o));
++}
++
++// 14b fixed-point ITU-R BT.601 YUV->RGB, matching the SSE2/scalar path.
++// Inputs are samples pre-shifted into the high byte (<< 8).
++static WEBP_INLINE void ConvertYUV444ToRGB(u16x8 Y0, u16x8 U0, u16x8 V0,
++                                           i16x8* const R, i16x8* const G,
++                                           u16x8* const B) {
++  const u16x8 k19077 = vec_splats((unsigned short)19077);
++  const u16x8 k26149 = vec_splats((unsigned short)26149);
++  const u16x8 k14234 = vec_splats((unsigned short)14234);
++  const u16x8 k33050 = vec_splats((unsigned short)33050);
++  const u16x8 k17685 = vec_splats((unsigned short)17685);
++  const u16x8 k6419  = vec_splats((unsigned short)6419);
++  const u16x8 k13320 = vec_splats((unsigned short)13320);
++  const u16x8 k8708  = vec_splats((unsigned short)8708);
++  const u16x8 six    = vec_splats((unsigned short)6);
++
++  const u16x8 Y1 = MulHi16(Y0, k19077);
++  const u16x8 R2 = vec_add(vec_sub(Y1, k14234), MulHi16(V0, k26149));
++  const u16x8 G4 = vec_sub(vec_add(Y1, k8708),
++                           vec_add(MulHi16(U0, k6419), MulHi16(V0, k13320)));
++  // 33050 needs unsigned saturating arithmetic; B can exceed 32767.
++  const u16x8 B2 = vec_subs(vec_adds(MulHi16(U0, k33050), Y1), k17685);
++
++  *R = vec_sra((i16x8)R2, six);
++  *G = vec_sra((i16x8)G4, six);
++  *B = vec_sr(B2, six);
++}
++
++// Load 8 bytes into the high byte of 8 u16 lanes (i.e. sample << 8).
++// Use an 8-byte copy (not a 16-byte vector load) to avoid reading past the
++// end of the source row, matching the SSE2 _mm_loadl_epi64 behavior.
++static WEBP_INLINE u16x8 LoadHi16(const uint8_t* WEBP_RESTRICT src) {
++  const u8x16 zero = vec_splats((unsigned char)0);
++  unsigned char tmp[16] = {0};
++  memcpy(tmp, src, 8);
++  return (u16x8)vec_mergeh(zero, vec_xl(0, tmp));
++}
++
++// Load 4 U/V bytes, shift into the high byte, and replicate each sample.
++static WEBP_INLINE u16x8 LoadUVHi8(const uint8_t* WEBP_RESTRICT src) {
++  const u8x16 zero = vec_splats((unsigned char)0);
++  unsigned char tmp[16] = {0};
++  memcpy(tmp, src, 4);
++  const u16x8 t = (u16x8)vec_mergeh(zero, vec_xl(0, tmp));
++  return vec_mergeh(t, t);
++}
++
++static WEBP_INLINE void YUV420ToRGB(const uint8_t* WEBP_RESTRICT y,
++                                    const uint8_t* WEBP_RESTRICT u,
++                                    const uint8_t* WEBP_RESTRICT v,
++                                    i16x8* const R, i16x8* const G,
++                                    u16x8* const B) {
++  ConvertYUV444ToRGB(LoadHi16(y), LoadUVHi8(u), LoadUVHi8(v), R, G, B);
++}
++
++// Pack four 8-lane channels into 32 interleaved bytes (c0 c1 c2 c3 per pixel).
++static WEBP_INLINE void PackAndStore4(i16x8 c0, i16x8 c1, i16x8 c2, i16x8 c3,
++                                      uint8_t* WEBP_RESTRICT dst) {
++  const u8x16 c02 = vec_packsu(c0, c2);
++  const u8x16 c13 = vec_packsu(c1, c3);
++  const u8x16 lo8 = vec_mergeh(c02, c13);
++  const u8x16 hi8 = vec_mergel(c02, c13);
++  vec_xst((u8x16)vec_mergeh((u16x8)lo8, (u16x8)hi8), 0, dst);
++  vec_xst((u8x16)vec_mergel((u16x8)lo8, (u16x8)hi8), 0, dst + 16);
++}
++
++static const i16x8 kAlpha = {255, 255, 255, 255, 255, 255, 255, 255};
++
++static void YuvToRgbaRow_VSX(const uint8_t* WEBP_RESTRICT y,
++                             const uint8_t* WEBP_RESTRICT u,
++                             const uint8_t* WEBP_RESTRICT v,
++                             uint8_t* WEBP_RESTRICT dst, int len) {
++  int n;
++  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
++    i16x8 R, G; u16x8 B;
++    YUV420ToRGB(y, u, v, &R, &G, &B);
++    PackAndStore4(R, G, (i16x8)B, kAlpha, dst);
++    y += 8; u += 4; v += 4;
++  }
++  for (; n < len; ++n) {
++    VP8YuvToRgba(y[0], u[0], v[0], dst);
++    dst += 4; y += 1; u += (n & 1); v += (n & 1);
++  }
++}
++
++static void YuvToBgraRow_VSX(const uint8_t* WEBP_RESTRICT y,
++                             const uint8_t* WEBP_RESTRICT u,
++                             const uint8_t* WEBP_RESTRICT v,
++                             uint8_t* WEBP_RESTRICT dst, int len) {
++  int n;
++  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
++    i16x8 R, G; u16x8 B;
++    YUV420ToRGB(y, u, v, &R, &G, &B);
++    PackAndStore4((i16x8)B, G, R, kAlpha, dst);
++    y += 8; u += 4; v += 4;
++  }
++  for (; n < len; ++n) {
++    VP8YuvToBgra(y[0], u[0], v[0], dst);
++    dst += 4; y += 1; u += (n & 1); v += (n & 1);
++  }
++}
++
++static void YuvToArgbRow_VSX(const uint8_t* WEBP_RESTRICT y,
++                             const uint8_t* WEBP_RESTRICT u,
++                             const uint8_t* WEBP_RESTRICT v,
++                             uint8_t* WEBP_RESTRICT dst, int len) {
++  int n;
++  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
++    i16x8 R, G; u16x8 B;
++    YUV420ToRGB(y, u, v, &R, &G, &B);
++    PackAndStore4(kAlpha, R, G, (i16x8)B, dst);
++    y += 8; u += 4; v += 4;
++  }
++  for (; n < len; ++n) {
++    VP8YuvToArgb(y[0], u[0], v[0], dst);
++    dst += 4; y += 1; u += (n & 1); v += (n & 1);
++  }
++}
++
++// Convert 32 YUV444 pixels and store the 32b-per-pixel result. Used by the
++// fancy upsampler in upsampling_vsx.c.
++void VP8YuvToRgba32_VSX(const uint8_t* WEBP_RESTRICT y,
++                        const uint8_t* WEBP_RESTRICT u,
++                        const uint8_t* WEBP_RESTRICT v,
++                        uint8_t* WEBP_RESTRICT dst) {
++  int n;
++  for (n = 0; n < 32; n += 8, dst += 32) {
++    i16x8 R, G; u16x8 B;
++    ConvertYUV444ToRGB(LoadHi16(y + n), LoadHi16(u + n), LoadHi16(v + n),
++                       &R, &G, &B);
++    PackAndStore4(R, G, (i16x8)B, kAlpha, dst);
++  }
++}
++
++void VP8YuvToBgra32_VSX(const uint8_t* WEBP_RESTRICT y,
++                        const uint8_t* WEBP_RESTRICT u,
++                        const uint8_t* WEBP_RESTRICT v,
++                        uint8_t* WEBP_RESTRICT dst) {
++  int n;
++  for (n = 0; n < 32; n += 8, dst += 32) {
++    i16x8 R, G; u16x8 B;
++    ConvertYUV444ToRGB(LoadHi16(y + n), LoadHi16(u + n), LoadHi16(v + n),
++                       &R, &G, &B);
++    PackAndStore4((i16x8)B, G, R, kAlpha, dst);
++  }
++}
++
++void VP8YuvToArgb32_VSX(const uint8_t* WEBP_RESTRICT y,
++                        const uint8_t* WEBP_RESTRICT u,
++                        const uint8_t* WEBP_RESTRICT v,
++                        uint8_t* WEBP_RESTRICT dst) {
++  int n;
++  for (n = 0; n < 32; n += 8, dst += 32) {
++    i16x8 R, G; u16x8 B;
++    ConvertYUV444ToRGB(LoadHi16(y + n), LoadHi16(u + n), LoadHi16(v + n),
++                       &R, &G, &B);
++    PackAndStore4(kAlpha, R, G, (i16x8)B, dst);
++  }
++}
++
++extern void WebPInitSamplersVSX(void);
++
++WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersVSX(void) {
++  WebPSamplers[MODE_RGBA] = YuvToRgbaRow_VSX;
++  WebPSamplers[MODE_BGRA] = YuvToBgraRow_VSX;
++  WebPSamplers[MODE_ARGB] = YuvToArgbRow_VSX;
++}
++
++#else  // !WEBP_USE_VSX
++
++WEBP_DSP_INIT_STUB(WebPInitSamplersVSX)
++
++#endif  // WEBP_USE_VSX
+diff --git a/media/libwebp/src/moz/cpu.cpp b/media/libwebp/src/moz/cpu.cpp
+index c6633170c923..82986d2f631e 100644
+--- a/media/libwebp/src/moz/cpu.cpp
++++ b/media/libwebp/src/moz/cpu.cpp
+@@ -35,6 +35,10 @@ static int MozCPUInfo(CPUFeature feature)
+     case kMIPSdspR2:
+     case kMSA:
+       return 1;
++#endif
++#if defined(WEBP_USE_VSX)
++    case kVSX:
++      return 1;
+ #endif
+     default:
+       return 0;
+-- 
+2.52.0
+

diff --git a/0003-Add-PPC64LE-JIT-backend.patch b/0003-Add-PPC64LE-JIT-backend.patch
new file mode 100644
index 0000000..ee08b33
--- /dev/null
+++ b/0003-Add-PPC64LE-JIT-backend.patch
@@ -0,0 +1,38205 @@
+From c79926e41764c6aa6ae596812b23bc35b470028c Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Trung=20L=C3=AA?= <8@tle.id.au>
+Date: Fri, 12 Jun 2026 16:02:28 +1000
+Subject: [PATCH 3/3] Add PPC64LE JIT backend
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Based on the work done by Cameron Kaiser and Justin Hibbits
+https://github.com/chmeeedalf/gecko-dev
+
+Co-authored-by: Cameron Kaiser <classilla@floodgap.com>
+Co-authored-by: Justin Hibbits <chmeeedalf@gmail.com>
+Assisted-by: Lance Albertson <lance@osuosl.org>
+Assisted-by: Thushan Fernando <thushan@thushanfernando.com>
+Assisted-by: Timothy Pearson <tpearson@solidsilicon.com>
+Assisted-by: Dan Horák <dan@danny.cz>
+Assisted-by: Hiếu Lê <modology@gmail.com>
+Assisted-by: Claude Fable 5 <noreply@anthropic.com>
+---
+ config/check_macroassembler_style.py          |    2 +
+ js/moz.configure                              |   34 +-
+ js/src/builtin/TestingFunctions.cpp           |   18 +
+ js/src/irregexp/RegExpAPI.cpp                 |    5 +-
+ .../irregexp/RegExpNativeMacroAssembler.cpp   |   28 +
+ .../tests/baseline/ppc64-branch8-16-narrow.js |  103 +
+ js/src/jit-test/tests/gc/gcparam.js           |    3 +-
+ .../tests/ion/mod-constant-pow2-minus-one.js  |   78 +
+ .../tests/ion/mod-pow2-negative-dividend.js   |   71 +
+ .../tests/math-min-max-corner-cases.js        |   50 +
+ js/src/jit-test/tests/wasm/atomicity.js       |    8 +-
+ .../jit-test/tests/wasm/excessive-inlining.js |   19 +-
+ .../jit-test/tests/wasm/memory-oob-message.js |   10 +-
+ .../tests/wasm/ppc64-argon2-tiering.js        |  124 +
+ .../tests/wasm/ppc64-compare-select-bench.js  |   70 +
+ .../jit-test/tests/wasm/ppc64-extmul-alias.js |  107 +
+ .../tests/wasm/ppc64-simd-vr-clobber.js       |  179 +
+ js/src/jit-test/tests/wasm/profiling.js       |    7 +
+ .../wasm/regress-ppc64-extract-lane-ctz.js    |   49 +
+ .../wasm/regress-ppc64-select-condition.js    |   30 +
+ .../wasm/regress-ppc64-trap-exit-simd-save.js |   64 +
+ .../bug-ppc64-simd-reduce-and-branch.js       |    7 +
+ .../bug-ppc64-simd-reduce-and-branch.wasm     |  Bin 0 -> 1148 bytes
+ js/src/jit-test/tests/wasm/simd/bug1946618.js |    7 +-
+ .../jit-test/tests/wasm/simd/ion-analysis.js  |    7 +-
+ js/src/jit/Assembler.h                        |    2 +
+ js/src/jit/BaselineIC.cpp                     |    2 +
+ js/src/jit/CacheIRCompiler.cpp                |   16 +
+ js/src/jit/CodeGenerator.cpp                  |    6 +
+ js/src/jit/CodeGenerator.h                    |    2 +
+ js/src/jit/EffectiveAddressAnalysis.cpp       |    2 +-
+ js/src/jit/ExecutableAllocator.cpp            |   10 +-
+ js/src/jit/FlushICache.cpp                    |    3 +-
+ js/src/jit/FlushICache.h                      |   11 +-
+ js/src/jit/GenerateABIFunctionType.py         |  100 +
+ js/src/jit/JitContext.cpp                     |    4 +
+ js/src/jit/JitFrames.cpp                      |   10 +
+ js/src/jit/JitFrames.h                        |   12 +-
+ js/src/jit/LIR.cpp                            |    4 +-
+ js/src/jit/LIR.h                              |   10 +-
+ js/src/jit/LIROps.yaml                        |   82 +-
+ js/src/jit/Label.h                            |    2 +-
+ js/src/jit/Lowering.cpp                       |    2 +-
+ js/src/jit/Lowering.h                         |    2 +
+ js/src/jit/MacroAssembler-inl.h               |    2 +
+ js/src/jit/MacroAssembler.cpp                 |   25 +-
+ js/src/jit/MacroAssembler.h                   |  647 +-
+ js/src/jit/MoveEmitter.h                      |    2 +
+ js/src/jit/MoveResolver.cpp                   |   16 +
+ js/src/jit/RegisterAllocator.h                |    7 +-
+ js/src/jit/Registers.h                        |    2 +
+ js/src/jit/Safepoints.cpp                     |   11 +
+ js/src/jit/SharedICHelpers-inl.h              |    2 +
+ js/src/jit/SharedICHelpers.h                  |    2 +
+ js/src/jit/SharedICRegisters.h                |    2 +
+ js/src/jit/Simulator.h                        |    2 +
+ js/src/jit/moz.build                          |   12 +
+ js/src/jit/ppc64/Architecture-ppc64.cpp       |  221 +
+ js/src/jit/ppc64/Architecture-ppc64.h         |  581 ++
+ js/src/jit/ppc64/Assembler-ppc64.cpp          | 3028 +++++++
+ js/src/jit/ppc64/Assembler-ppc64.h            | 2114 +++++
+ js/src/jit/ppc64/CodeGenerator-ppc64.cpp      | 3647 ++++++++
+ js/src/jit/ppc64/CodeGenerator-ppc64.h        |  101 +
+ js/src/jit/ppc64/LIR-ppc64.h                  |  135 +
+ js/src/jit/ppc64/Lowering-ppc64.cpp           | 1324 +++
+ js/src/jit/ppc64/Lowering-ppc64.h             |  105 +
+ js/src/jit/ppc64/MacroAssembler-ppc64-inl.h   | 6142 ++++++++++++++
+ js/src/jit/ppc64/MacroAssembler-ppc64.cpp     | 3467 ++++++++
+ js/src/jit/ppc64/MacroAssembler-ppc64.h       | 2031 +++++
+ js/src/jit/ppc64/MoveEmitter-ppc64.cpp        |  357 +
+ js/src/jit/ppc64/MoveEmitter-ppc64.h          |   64 +
+ js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h  |   83 +
+ js/src/jit/ppc64/SharedICHelpers-ppc64.h      |   97 +
+ js/src/jit/ppc64/SharedICRegisters-ppc64.h    |   46 +
+ js/src/jit/ppc64/Simulator-ppc64.cpp          | 7296 +++++++++++++++++
+ js/src/jit/ppc64/Simulator-ppc64.h            |  556 ++
+ js/src/jit/ppc64/Trampoline-ppc64.cpp         |  648 ++
+ js/src/jit/shared/Assembler-shared.h          |    5 +-
+ .../AtomicOperations-feeling-lucky-gcc.h      |    3 +-
+ js/src/jit/shared/CodeGenerator-shared.cpp    |    6 +-
+ js/src/jit/shared/Lowering-shared-inl.h       |    2 +-
+ js/src/js-config.mozbuild                     |    1 +
+ js/src/jsapi-tests/testJitABIcalls.cpp        |    3 +
+ js/src/jsapi-tests/testWasmReturnCalls.cpp    |   10 +-
+ js/src/jsapi-tests/testsJit.cpp               |   20 +
+ js/src/shell/js.cpp                           |   25 +
+ js/src/shell/jsshell.h                        |    3 +-
+ js/src/tests/shell/os.js                      |    8 +-
+ js/src/util/Poison.h                          |    2 +
+ js/src/wasm/WasmAnyRef.h                      |    7 +-
+ js/src/wasm/WasmBCDefs.h                      |    7 +
+ js/src/wasm/WasmBCMemory.cpp                  |   47 +-
+ js/src/wasm/WasmBCRegDefs.h                   |   12 +-
+ js/src/wasm/WasmBaselineCompile.cpp           |  148 +-
+ js/src/wasm/WasmCodegenConstants.h            |    3 +-
+ js/src/wasm/WasmCodegenTypes.cpp              |   11 +-
+ js/src/wasm/WasmCompile.cpp                   |    6 +-
+ js/src/wasm/WasmFrameIter.cpp                 |  118 +
+ js/src/wasm/WasmGC.cpp                        |    8 +
+ js/src/wasm/WasmGenerator.cpp                 |   18 +-
+ js/src/wasm/WasmIonCompile.cpp                |    2 +-
+ js/src/wasm/WasmMemory.cpp                    |    4 +-
+ js/src/wasm/WasmSignalHandlers.cpp            |   20 +-
+ js/src/wasm/WasmStacks.cpp                    |   31 +-
+ js/src/wasm/WasmStubs.cpp                     |   43 +-
+ js/src/wasm/WasmSummarizeInsn.cpp             |  163 +
+ js/src/wasm/WasmValue.cpp                     |    2 +-
+ mfbt/Assertions.h                             |    5 +
+ 108 files changed, 34442 insertions(+), 438 deletions(-)
+ create mode 100644 js/src/jit-test/tests/baseline/ppc64-branch8-16-narrow.js
+ create mode 100644 js/src/jit-test/tests/ion/mod-constant-pow2-minus-one.js
+ create mode 100644 js/src/jit-test/tests/ion/mod-pow2-negative-dividend.js
+ create mode 100644 js/src/jit-test/tests/math-min-max-corner-cases.js
+ create mode 100644 js/src/jit-test/tests/wasm/ppc64-argon2-tiering.js
+ create mode 100644 js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js
+ create mode 100644 js/src/jit-test/tests/wasm/ppc64-extmul-alias.js
+ create mode 100644 js/src/jit-test/tests/wasm/ppc64-simd-vr-clobber.js
+ create mode 100644 js/src/jit-test/tests/wasm/regress-ppc64-extract-lane-ctz.js
+ create mode 100644 js/src/jit-test/tests/wasm/regress-ppc64-select-condition.js
+ create mode 100644 js/src/jit-test/tests/wasm/regress-ppc64-trap-exit-simd-save.js
+ create mode 100644 js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.js
+ create mode 100644 js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.wasm
+ create mode 100644 js/src/jit/ppc64/Architecture-ppc64.cpp
+ create mode 100644 js/src/jit/ppc64/Architecture-ppc64.h
+ create mode 100644 js/src/jit/ppc64/Assembler-ppc64.cpp
+ create mode 100644 js/src/jit/ppc64/Assembler-ppc64.h
+ create mode 100644 js/src/jit/ppc64/CodeGenerator-ppc64.cpp
+ create mode 100644 js/src/jit/ppc64/CodeGenerator-ppc64.h
+ create mode 100644 js/src/jit/ppc64/LIR-ppc64.h
+ create mode 100644 js/src/jit/ppc64/Lowering-ppc64.cpp
+ create mode 100644 js/src/jit/ppc64/Lowering-ppc64.h
+ create mode 100644 js/src/jit/ppc64/MacroAssembler-ppc64-inl.h
+ create mode 100644 js/src/jit/ppc64/MacroAssembler-ppc64.cpp
+ create mode 100644 js/src/jit/ppc64/MacroAssembler-ppc64.h
+ create mode 100644 js/src/jit/ppc64/MoveEmitter-ppc64.cpp
+ create mode 100644 js/src/jit/ppc64/MoveEmitter-ppc64.h
+ create mode 100644 js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h
+ create mode 100644 js/src/jit/ppc64/SharedICHelpers-ppc64.h
+ create mode 100644 js/src/jit/ppc64/SharedICRegisters-ppc64.h
+ create mode 100644 js/src/jit/ppc64/Simulator-ppc64.cpp
+ create mode 100644 js/src/jit/ppc64/Simulator-ppc64.h
+ create mode 100644 js/src/jit/ppc64/Trampoline-ppc64.cpp
+
+diff --git a/config/check_macroassembler_style.py b/config/check_macroassembler_style.py
+index aa1a54104e26..ba73de388099 100644
+--- a/config/check_macroassembler_style.py
++++ b/config/check_macroassembler_style.py
+@@ -33,6 +33,7 @@ all_architecture_names = set([
+     "arm64",
+     "loong64",
+     "riscv64",
++    "ppc64",
+     "wasm32",
+ ])
+ all_shared_architecture_names = set([
+@@ -41,6 +42,7 @@ all_shared_architecture_names = set([
+     "arm64",
+     "loong64",
+     "riscv64",
++    "ppc64",
+     "wasm32",
+ ])
+ 
+diff --git a/js/moz.configure b/js/moz.configure
+index 26cc85622654..5310dd08506f 100644
+--- a/js/moz.configure
++++ b/js/moz.configure
+@@ -264,6 +264,7 @@ def jit_default(target, enable_portable_baseline_interp):
+         "aarch64",
+         "mips64",
+         "loongarch64",
++        "ppc64",
+         "riscv64",
+     ):
+         return True
+@@ -285,7 +286,7 @@ def report_deprecated(value):
+ # =======================================================
+ option(
+     "--enable-simulator",
+-    choices=("arm", "arm64", "mips64", "loong64", "riscv64"),
++    choices=("arm", "arm64", "mips64", "loong64", "riscv64", "ppc64"),
+     nargs=1,
+     help="Enable a JIT code simulator for the specified architecture",
+ )
+@@ -302,7 +303,7 @@ def simulator(jit_enabled, simulator_enabled, target):
+         if target.cpu != "x86":
+             die("The %s simulator only works on x86." % sim_cpu)
+ 
+-    if sim_cpu in ("arm64", "mips64", "loong64", "riscv64"):
++    if sim_cpu in ("arm64", "mips64", "loong64", "riscv64", "ppc64"):
+         if target.cpu != "x86_64" and target.cpu != "aarch64":
+             die("The %s simulator only works on x86-64 or arm64." % sim_cpu)
+ 
+@@ -315,12 +316,14 @@ set_config("JS_SIMULATOR_ARM64", simulator.arm64)
+ set_config("JS_SIMULATOR_MIPS64", simulator.mips64)
+ set_config("JS_SIMULATOR_LOONG64", simulator.loong64)
+ set_config("JS_SIMULATOR_RISCV64", simulator.riscv64)
++set_config("JS_SIMULATOR_PPC64", simulator.ppc64)
+ set_define("JS_SIMULATOR", depends_if(simulator)(lambda x: True))
+ set_define("JS_SIMULATOR_ARM", simulator.arm)
+ set_define("JS_SIMULATOR_ARM64", simulator.arm64)
+ set_define("JS_SIMULATOR_MIPS64", simulator.mips64)
+ set_define("JS_SIMULATOR_LOONG64", simulator.loong64)
+ set_define("JS_SIMULATOR_RISCV64", simulator.riscv64)
++set_define("JS_SIMULATOR_PPC64", simulator.ppc64)
+ 
+ 
+ @depends("--enable-jit", simulator, target)
+@@ -337,6 +340,8 @@ def jit_codegen(jit_enabled, simulator, target):
+         return namespace(x64=True)
+     elif target.cpu == "loongarch64":
+         return namespace(loong64=True)
++    elif target.cpu == "ppc64":
++        return namespace(ppc64=True)
+     elif target.cpu == "riscv64":
+         return namespace(riscv64=True)
+ 
+@@ -348,6 +353,7 @@ set_config("JS_CODEGEN_ARM", jit_codegen.arm)
+ set_config("JS_CODEGEN_ARM64", jit_codegen.arm64)
+ set_config("JS_CODEGEN_MIPS64", jit_codegen.mips64)
+ set_config("JS_CODEGEN_LOONG64", jit_codegen.loong64)
++set_config("JS_CODEGEN_PPC64", jit_codegen.ppc64)
+ set_config("JS_CODEGEN_RISCV64", jit_codegen.riscv64)
+ set_config("JS_CODEGEN_X86", jit_codegen.x86)
+ set_config("JS_CODEGEN_X64", jit_codegen.x64)
+@@ -358,6 +364,7 @@ set_define("JS_CODEGEN_ARM", jit_codegen.arm)
+ set_define("JS_CODEGEN_ARM64", jit_codegen.arm64)
+ set_define("JS_CODEGEN_MIPS64", jit_codegen.mips64)
+ set_define("JS_CODEGEN_LOONG64", jit_codegen.loong64)
++set_define("JS_CODEGEN_PPC64", jit_codegen.ppc64)
+ set_define("JS_CODEGEN_RISCV64", jit_codegen.riscv64)
+ set_define("JS_CODEGEN_X86", jit_codegen.x86)
+ set_define("JS_CODEGEN_X64", jit_codegen.x64)
+@@ -728,7 +735,7 @@ def default_wasm_jspi(
+         return
+ 
+     if simulator:
+-        return simulator[0] in ("arm64", "arm", "loong64", "mips64", "riscv64")
++        return simulator[0] in ("arm64", "arm", "loong64", "mips64", "ppc64", "riscv64")
+ 
+     if target.cpu in (
+         "x86_64",
+@@ -737,6 +744,7 @@ def default_wasm_jspi(
+         "arm",
+         "loongarch64",
+         "mips64",
++        "ppc64",
+         "riscv64",
+     ):
+         return True
+@@ -768,10 +776,11 @@ def wasm_jspi(value, jit_enabled, simulator, no_experimental, target):
+         "arm",
+         "loong64",
+         "mips64",
++        "ppc64",
+         "riscv64",
+     ):
+         die(
+-            "--enable-wasm-jspi is only supported for arm64/arm/loong64/mips64/riscv64 simulators"
++            "--enable-wasm-jspi is only supported for arm64/arm/loong64/mips64/ppc64/riscv64 simulators"
+         )
+ 
+     if target.cpu in (
+@@ -781,12 +790,13 @@ def wasm_jspi(value, jit_enabled, simulator, no_experimental, target):
+         "arm",
+         "loongarch64",
+         "mips64",
++        "ppc64",
+         "riscv64",
+     ):
+         return True
+ 
+     die(
+-        "--enable-wasm-jspi only possible when targeting the x86_64/x86/arm64/arm/loongarch64/mips64/riscv64 jits"
++        "--enable-wasm-jspi only possible when targeting the x86_64/x86/arm64/arm/loongarch64/mips64/ppc64/riscv64 jits"
+     )
+ 
+ 
+@@ -821,10 +831,10 @@ def default_wasm_simd(jit_enabled, simulator, target):
+     if not jit_enabled:
+         return
+ 
+-    if simulator and (simulator[0] != "arm64"):
++    if simulator and simulator[0] not in ("arm64", "ppc64"):
+         return
+ 
+-    if target.cpu in ("x86_64", "x86", "aarch64"):
++    if target.cpu in ("x86_64", "x86", "aarch64", "ppc64"):
+         return True
+ 
+ 
+@@ -849,13 +859,15 @@ def wasm_simd(value, jit_enabled, simulator, target, no_experimental):
+     if not jit_enabled:
+         die("--enable-wasm-simd requires --enable-jit")
+ 
+-    if simulator and (simulator[0] != "arm64"):
+-        die("--enable-wasm-simd is not supported for simulators, except arm64")
++    if simulator and simulator[0] not in ("arm64", "ppc64"):
++        die(
++            "--enable-wasm-simd is not supported for simulators, except arm64 and ppc64"
++        )
+ 
+-    if target.cpu in ("x86_64", "x86", "aarch64"):
++    if target.cpu in ("x86_64", "x86", "aarch64", "ppc64"):
+         return True
+ 
+-    die("--enable-wasm-simd only possible when targeting the x86_64/x86/arm64 jits")
++    die("--enable-wasm-simd only possible when targeting the x86_64/x86/arm64/ppc64 jits")
+ 
+ 
+ set_config("ENABLE_WASM_SIMD", wasm_simd)
+diff --git a/js/src/builtin/TestingFunctions.cpp b/js/src/builtin/TestingFunctions.cpp
+index be8b3d0e16b6..2291d58dc0a1 100644
+--- a/js/src/builtin/TestingFunctions.cpp
++++ b/js/src/builtin/TestingFunctions.cpp
+@@ -447,6 +447,15 @@ static bool GetBuildConfiguration(JSContext* cx, unsigned argc, Value* vp) {
+     return false;
+   }
+ 
++#ifdef JS_CODEGEN_PPC64
++  value = BooleanValue(true);
++#else
++  value = BooleanValue(false);
++#endif
++  if (!JS_SetProperty(cx, info, "ppc64", value)) {
++    return false;
++  }
++
+ #ifdef JS_CODEGEN_LOONG64
+   value = BooleanValue(true);
+ #else
+@@ -483,6 +492,15 @@ static bool GetBuildConfiguration(JSContext* cx, unsigned argc, Value* vp) {
+     return false;
+   }
+ 
++#ifdef JS_SIMULATOR_PPC64
++  value = BooleanValue(true);
++#else
++  value = BooleanValue(false);
++#endif
++  if (!JS_SetProperty(cx, info, "ppc64-simulator", value)) {
++    return false;
++  }
++
+ #ifdef MOZ_ASAN
+   value = BooleanValue(true);
+ #else
+diff --git a/js/src/irregexp/RegExpAPI.cpp b/js/src/irregexp/RegExpAPI.cpp
+index 310cd85c6a20..377509574f28 100644
+--- a/js/src/irregexp/RegExpAPI.cpp
++++ b/js/src/irregexp/RegExpAPI.cpp
+@@ -495,7 +495,10 @@ class RegExpDepthCheck final : public v8::internal::regexp::Visitor {
+ 
+   // This size is picked to be comfortably larger than any
+   // RegExp*::ToNode stack frame.
+-#if !defined(DEBUG) && !defined(MOZ_CODE_COVERAGE)
++#if defined(__powerpc64__)
++  // PPC64 ELFv2 has larger minimum stack frames.
++  static const size_t FRAME_PADDING = 256 * 4;
++#elif !defined(DEBUG) && !defined(MOZ_CODE_COVERAGE)
+   static const size_t FRAME_PADDING = 256;
+ #else
+   // Use a slightly larger padding for debug and code coverage builds.
+diff --git a/js/src/irregexp/RegExpNativeMacroAssembler.cpp b/js/src/irregexp/RegExpNativeMacroAssembler.cpp
+index ae351226797b..a396aeb3c731 100644
+--- a/js/src/irregexp/RegExpNativeMacroAssembler.cpp
++++ b/js/src/irregexp/RegExpNativeMacroAssembler.cpp
+@@ -990,8 +990,21 @@ void SMRegExpMacroAssembler::CheckBacktrackStackLimit() {
+       AbsoluteAddress(isolate()->regexp_stack()->limit_address_address()),
+       backtrack_stack_pointer_, &no_stack_overflow);
+ 
++#ifdef JS_CODEGEN_PPC64
++  // LR on PowerPC isn't a GPR, so we have to explicitly save it before
++  // calling or the regexp's return address will be clobbered.
++  masm_.xs_mflr(temp1_);
++  masm_.as_stdu(temp1_, masm_.getStackPointer(), -8);
++#endif
++
+   masm_.call(&stack_overflow_label_);
+ 
++#ifdef JS_CODEGEN_PPC64
++  masm_.as_ld(temp1_, masm_.getStackPointer(), 0);
++  masm_.xs_mtlr(temp1_);
++  masm_.as_addi(masm_.getStackPointer(), masm_.getStackPointer(), 8);
++#endif
++
+   // Exit with an exception if the call failed
+   masm_.branchTest32(Assembler::Zero, temp0_, temp0_,
+                      &exit_with_exception_label_);
+@@ -1080,6 +1093,13 @@ void SMRegExpMacroAssembler::createStackFrame() {
+   masm_.initPseudoStackPtr();
+ #endif
+ 
++#ifdef JS_CODEGEN_PPC64
++  // PPC64's link register is an SPR, not a GPR, so it cannot be included in
++  // SavedNonVolatileRegisters. Save it explicitly before the frame pointer
++  // so that abiret()'s blr can return to the caller after we restore it.
++  masm_.pushReturnAddress();
++#endif
++
+   masm_.Push(js::jit::FramePointer);
+   masm_.moveStackPtrTo(js::jit::FramePointer);
+ 
+@@ -1308,6 +1328,9 @@ void SMRegExpMacroAssembler::exitHandler() {
+   // Perform a plain Ret(), as abiret() will move SP <- PSP and that is wrong.
+   masm_.Ret(vixl::lr);
+ #else
++#  ifdef JS_CODEGEN_PPC64
++  masm_.popReturnAddress();
++#  endif
+   masm_.abiret();
+ #endif
+ 
+@@ -1351,6 +1374,11 @@ void SMRegExpMacroAssembler::stackOverflowHandler() {
+ 
+   // Adjust for the return address on the stack.
+   size_t frameOffset = sizeof(void*);
++#ifdef JS_CODEGEN_PPC64
++  // CheckBacktrackStackLimit pushes LR before calling us, so there's a
++  // second return address on the stack.
++  frameOffset += sizeof(void*);
++#endif
+ 
+   volatileRegs.takeUnchecked(temp0_);
+   volatileRegs.takeUnchecked(temp1_);
+diff --git a/js/src/jit-test/tests/baseline/ppc64-branch8-16-narrow.js b/js/src/jit-test/tests/baseline/ppc64-branch8-16-narrow.js
+new file mode 100644
+index 000000000000..fc1074a9ef8b
+--- /dev/null
++++ b/js/src/jit-test/tests/baseline/ppc64-branch8-16-narrow.js
+@@ -0,0 +1,103 @@
++// Regression test for PPC64 branch8/branch16 width-narrowing under Equal /
++// NotEqual / unsigned comparisons. Two prior bugs:
++//
++//   1. Sign-extending the load while move32(Imm32) zero-extended the imm
++//      caused spurious mismatch when the loaded byte/halfword had its high
++//      bit set (e.g. "ÀÁÂ".startsWith("ÀÁÂ") returned false because byte 0xC0
++//      sign-extended to 0xFF...C0 but the imm 0xC0 zero-extended to 0x00C0,
++//      so cmpw on the low 32 bits saw a negative vs positive value).
++//
++//   2. Always zero-extending the load broke `byte == Imm32(-1)` because -1
++//      sign-extends in the imm path: the loaded 0x000000FF didn't match the
++//      materialized 0xFFFFFFFF.
++//
++// Fix: cast the immediate to uint8/uint16 (equality + unsigned) or int8/int16
++// (signed relational) so both sides have matching bit patterns regardless of
++// how move32(Imm32) chose to materialize it. Match ARM64/LoongArch64/RISC-V.
++//
++// We exercise both byte and halfword branch paths via TypedArray loads and
++// String.prototype.startsWith with a constant search string (the original
++// failing site lowered to branch16(NotEqual, addr, Imm32(0xC1C0))).
++
++// --- Direct byte/halfword equality through TypedArray ---
++{
++  let u8 = new Uint8Array([0, 1, 0x7F, 0x80, 0xC0, 0xC1, 0xFE, 0xFF]);
++  let i8 = new Int8Array(u8.buffer);
++  let u16 = new Uint16Array([0x0000, 0x7FFF, 0x8000, 0xC1C0, 0xFFFE, 0xFFFF]);
++  let i16 = new Int16Array(u16.buffer);
++
++  // Force baseline + Ion to specialize the comparisons.
++  function eqU8(arr, idx, val) {
++    return arr[idx] === val;
++  }
++  function eqI8(arr, idx, val) {
++    return arr[idx] === val;
++  }
++  function eqU16(arr, idx, val) {
++    return arr[idx] === val;
++  }
++  function eqI16(arr, idx, val) {
++    return arr[idx] === val;
++  }
++
++  for (let i = 0; i < 200; i++) {
++    // High-bit-set bytes: bit pattern equality must hold both signed and
++    // unsigned interpretations of the immediate.
++    assertEq(eqU8(u8, 4, 0xC0), true);   // unsigned compare 0xC0 == 0xC0
++    assertEq(eqU8(u8, 4, 0xC1), false);
++    assertEq(eqU8(u8, 7, 0xFF), true);
++    assertEq(eqU8(u8, 7, -1 & 0xFF), true);   // 0xFF written as -1&0xFF
++
++    // Signed Int8 view: 0xFF is -1, 0xC0 is -64.
++    assertEq(eqI8(i8, 4, -64), true);
++    assertEq(eqI8(i8, 7, -1), true);
++    assertEq(eqI8(i8, 4, -63), false);
++
++    // Halfword variants: the original startswith failure pattern was
++    // (Latin-1 char 0xC1C0) — a 16-bit value with bit 15 set.
++    assertEq(eqU16(u16, 3, 0xC1C0), true);
++    assertEq(eqU16(u16, 3, 0xC1C1), false);
++    assertEq(eqU16(u16, 5, 0xFFFF), true);
++    assertEq(eqU16(u16, 5, -1 & 0xFFFF), true);
++
++    assertEq(eqI16(i16, 3, -15936), true);  // 0xC1C0 as i16 = -15936
++    assertEq(eqI16(i16, 5, -1), true);
++    assertEq(eqI16(i16, 5, -2), false);
++  }
++}
++
++// --- String.prototype.startsWith with a Latin-1 constant search ---
++// This was the original failing site — Ion lowers a constant search string
++// of length 1..32 into a sequence of byte-wise comparisons.
++{
++  let s = "ÀÁÂ";  // Latin-1 length 3, bytes 0xC0 0xC1 0xC2 (all high-bit set)
++  function check() {
++    return s.startsWith("ÀÁÂ");
++  }
++  for (let i = 0; i < 200; i++) {
++    assertEq(check(), true);
++  }
++
++  // Mismatch on a single high-bit byte must report not-equal.
++  let s2 = "ÀÁÃ";  // last byte 0xC3 instead of 0xC2
++  function check2() {
++    return s2.startsWith("ÀÁÂ");
++  }
++  for (let i = 0; i < 200; i++) {
++    assertEq(check2(), false);
++  }
++}
++
++// --- Signed relational comparisons still work (we kept the sign-extend path) ---
++{
++  let i8 = new Int8Array([0x7F, -1, -128, 1, 0]);
++  function ltZero(idx) {
++    return i8[idx] < 0;
++  }
++  for (let i = 0; i < 200; i++) {
++    assertEq(ltZero(0), false);  // 0x7F = +127
++    assertEq(ltZero(1), true);   // -1
++    assertEq(ltZero(2), true);   // -128
++    assertEq(ltZero(3), false);  // 1
++  }
++}
+diff --git a/js/src/jit-test/tests/gc/gcparam.js b/js/src/jit-test/tests/gc/gcparam.js
+index 51d58662193f..48e5a97c135f 100644
+--- a/js/src/jit-test/tests/gc/gcparam.js
++++ b/js/src/jit-test/tests/gc/gcparam.js
+@@ -30,7 +30,8 @@ testGetParam("chunkBytes");
+ testGetParam("helperThreadCount");
+ 
+ testChangeParam("maxBytes");
+-testChangeParam("minNurseryBytes", 16 * 1024);
++var pageSize = gcparam("systemPageSizeKB") * 1024;
++testChangeParam("minNurseryBytes", pageSize);
+ testChangeParam("maxNurseryBytes", 1024 * 1024);
+ testChangeParam("incrementalGCEnabled");
+ testChangeParam("perZoneGCEnabled");
+diff --git a/js/src/jit-test/tests/ion/mod-constant-pow2-minus-one.js b/js/src/jit-test/tests/ion/mod-constant-pow2-minus-one.js
+new file mode 100644
+index 000000000000..9028f5587c65
+--- /dev/null
++++ b/js/src/jit-test/tests/ion/mod-constant-pow2-minus-one.js
+@@ -0,0 +1,78 @@
++// Regression test for a PPC64 Ion miscompile of integer modulo by a
++// constant of the form 2^n - 1 (e.g. 65535).
++//
++// lowerModI routes `x % (2^n - 1)` to LModMaskI, whose codegen
++// (ma_mod_mask) materialized the mask 2^n - 1 with xs_li(). xs_li takes a
++// signed int16_t, so a mask of 0xFFFF was truncated to -1, corrupting the
++// digit-summing reduction. The bug only affected masks that do not fit in a
++// signed 16-bit immediate, i.e. divisors >= 65535 (n >= 16); smaller
++// 2^n - 1 divisors such as 255 were unaffected.
++//
++// The reference uses a non-constant divisor, which lowers to the
++// hardware-divide modulo path (LModI) and is therefore independent of the
++// LModMaskI codegen under test.
++
++function refmod(x, d) {
++  // d is not a constant here -> divide-based modulo, not LModMaskI.
++  return (x % d) | 0;
++}
++
++// One function per constant divisor so the divisor is a literal and the
++// LModMaskI path is selected.
++function mod255(x) { return (x % 255) | 0; }
++function mod32767(x) { return (x % 32767) | 0; }
++function mod65535(x) { return (x % 65535) | 0; }
++function mod131071(x) { return (x % 131071) | 0; }
++function mod1048575(x) { return (x % 1048575) | 0; }
++
++const cases = [
++  [mod255, 255],
++  [mod32767, 32767],
++  [mod65535, 65535],
++  [mod131071, 131071],
++  [mod1048575, 1048575],
++];
++
++// Inputs spanning small values, values with bits above the mask width
++// (so the multi-digit reduction is exercised), and negatives.
++const inputs = [];
++for (let i = 0; i < 64; i++) {
++  inputs.push(Math.imul(i, 2654435761) | 0);
++  inputs.push((i * 65535 + i) | 0);
++  inputs.push((i * 131071 - 7) | 0);
++  inputs.push(-Math.imul(i, 40503) | 0);
++}
++inputs.push(0, 1, -1, 65534, 65535, 65536, 0x7fffffff, -0x80000000);
++
++// Warm up through the tiers, then assert each constant-divisor result
++// matches the divide-based reference.
++for (let iter = 0; iter < 2000; iter++) {
++  for (const [fn, d] of cases) {
++    for (const x of inputs) {
++      assertEq(fn(x), refmod(x, d));
++    }
++  }
++}
++
++// Register-pressure variant: mirrors the shape that exposed the bug (many
++// live locals forcing the mask materialization to interact with spills).
++function pressure(buf, i) {
++  let v0 = i, v1 = i + 1, v2 = i + 2, v3 = i + 3, v4 = i + 4, v5 = i + 5;
++  let v6 = i + 6, v7 = i + 7, v8 = i + 8, v9 = i + 9, v10 = i + 10, v11 = i + 11;
++  let v12 = i + 12, v13 = i + 13, v14 = i + 14, v15 = i + 15;
++  const r = (buf[i & 63] % 65535) | 0;
++  // Keep every local live to the return without altering r.
++  const live = (v0 ^ v1 ^ v2 ^ v3 ^ v4 ^ v5 ^ v6 ^ v7 ^
++                v8 ^ v9 ^ v10 ^ v11 ^ v12 ^ v13 ^ v14 ^ v15) & 0;
++  return r + live;
++}
++
++const buf = new Int32Array(64);
++for (let i = 0; i < buf.length; i++) {
++  buf[i] = Math.imul(i, 2654435761) | 0;
++}
++for (let iter = 0; iter < 5000; iter++) {
++  for (let i = 0; i < 64; i++) {
++    assertEq(pressure(buf, i), refmod(buf[i & 63], 65535));
++  }
++}
+diff --git a/js/src/jit-test/tests/ion/mod-pow2-negative-dividend.js b/js/src/jit-test/tests/ion/mod-pow2-negative-dividend.js
+new file mode 100644
+index 000000000000..9905cc4a8f36
+--- /dev/null
++++ b/js/src/jit-test/tests/ion/mod-pow2-negative-dividend.js
+@@ -0,0 +1,71 @@
++// Regression test for a PPC64 Ion miscompile of integer modulo by a constant
++// power of two (e.g. 65536) with a negative dividend.
++//
++// lowerModI routes `x % 2^n` to LModPowTwoI, whose codegen tested the sign of
++// the dividend with branchPtr (a 64-bit compare). When the int32 dividend was
++// held zero-extended in its register, the 64-bit test misclassified a negative
++// value as non-negative and took the unmasked positive path, returning
++// `x & (2^n - 1)` instead of the correct (negative) `x % 2^n`. Fixed by using a
++// 32-bit sign test (branch32).
++//
++// The reference uses a non-constant divisor, which lowers to the divide-based
++// modulo path (LModI), independent of LModPowTwoI.
++
++function refmod(x, d) {
++  return (x % d) | 0;
++}
++
++function mod256(x) { return (x % 256) | 0; }
++function mod1024(x) { return (x % 1024) | 0; }
++function mod4096(x) { return (x % 4096) | 0; }
++function mod65536(x) { return (x % 65536) | 0; }
++function mod1048576(x) { return (x % 1048576) | 0; }
++function mod1073741824(x) { return (x % 1073741824) | 0; }
++
++const cases = [
++  [mod256, 256],
++  [mod1024, 1024],
++  [mod4096, 4096],
++  [mod65536, 65536],
++  [mod1048576, 1048576],
++  [mod1073741824, 1073741824],
++];
++
++// Heavy on negative dividends (the broken path), plus boundary values.
++const inputs = [];
++for (let i = 1; i <= 64; i++) {
++  inputs.push(-Math.imul(i, 2654435761) | 0);
++  inputs.push(-(i * 168));
++  inputs.push(-(i * 70001));
++  inputs.push(Math.imul(i, 40503) | 0);
++}
++inputs.push(0, -1, 1, -168, -65535, -65536, -65537, 168,
++            0x7fffffff, -0x80000000, -0x7fffffff);
++
++for (let iter = 0; iter < 3000; iter++) {
++  for (const [fn, d] of cases) {
++    for (const x of inputs) {
++      assertEq(fn(x), refmod(x, d));
++    }
++  }
++}
++
++// Register-pressure variant: a negative dividend produced at runtime
++// (float->int) with many live locals, mirroring the shape that exposed the bug.
++function pressure(seed) {
++  let v0 = seed, v1 = seed + 1, v2 = seed + 2, v3 = seed + 3, v4 = seed + 4;
++  let v5 = seed + 5, v6 = seed + 6, v7 = seed + 7, v8 = seed + 8, v9 = seed + 9;
++  let v10 = seed + 10, v11 = seed + 11, v12 = seed + 12, v13 = seed + 13;
++  let d0 = seed * 0.5, d1 = seed * 1.5, d2 = -seed * 2.5;
++  const neg = (Math.fround(-(Math.abs(seed) + 0.7)) | 0);
++  const r = (neg % 65536) | 0;
++  const live = (v0 ^ v1 ^ v2 ^ v3 ^ v4 ^ v5 ^ v6 ^ v7 ^ v8 ^ v9 ^
++                v10 ^ v11 ^ v12 ^ v13 ^ (d0 | 0) ^ (d1 | 0) ^ (d2 | 0)) & 0;
++  return r + live;
++}
++for (let iter = 0; iter < 5000; iter++) {
++  for (let s = 1; s <= 200; s++) {
++    const expect = ((Math.fround(-(s + 0.7)) | 0) % 65536) | 0;
++    assertEq(pressure(s), expect);
++  }
++}
+diff --git a/js/src/jit-test/tests/math-min-max-corner-cases.js b/js/src/jit-test/tests/math-min-max-corner-cases.js
+new file mode 100644
+index 000000000000..7ac2c59caeff
+--- /dev/null
++++ b/js/src/jit-test/tests/math-min-max-corner-cases.js
+@@ -0,0 +1,50 @@
++// Math.min / Math.max corner cases. Exercises the POWER9 xsminjdp /
++// xsmaxjdp J-form fast path on PPC64 (and the fcmpu/branch fallback on
++// POWER8 forced); other backends already cover this via shared fp tests
++// but the truth table is small and worth pinning explicitly.
++//
++// JS semantics (ECMA-262):
++//   - Math.max(-0, +0) === +0; Math.min(-0, +0) === -0
++//   - Math.max(-0, -0) === -0; Math.min(+0, +0) === +0
++//   - Any NaN operand → NaN
++//   - ±Inf and ordinary numerics by value
++
++function objectIsPositiveZero(v) {
++  return v === 0 && Object.is(v, 0);
++}
++function objectIsNegativeZero(v) {
++  return v === 0 && Object.is(v, -0);
++}
++
++// Direct calls — these get inlined by Ion as MMinMax intrinsics, which
++// emit the relevant min/max helper.
++function check() {
++  // Max corner cases.
++  assertEq(objectIsPositiveZero(Math.max(-0, +0)), true);
++  assertEq(objectIsPositiveZero(Math.max(+0, -0)), true);
++  assertEq(objectIsNegativeZero(Math.max(-0, -0)), true);
++  assertEq(objectIsPositiveZero(Math.max(+0, +0)), true);
++  assertEq(Number.isNaN(Math.max(NaN, 5)), true);
++  assertEq(Number.isNaN(Math.max(5, NaN)), true);
++  assertEq(Number.isNaN(Math.max(NaN, NaN)), true);
++  assertEq(Math.max(-Infinity, 5), 5);
++  assertEq(Math.max(Infinity, 5), Infinity);
++  assertEq(Math.max(1, 2), 2);
++  assertEq(Math.max(-1, -2), -1);
++  assertEq(Math.max(1.5, 2.5), 2.5);
++
++  // Min corner cases.
++  assertEq(objectIsNegativeZero(Math.min(-0, +0)), true);
++  assertEq(objectIsNegativeZero(Math.min(+0, -0)), true);
++  assertEq(objectIsNegativeZero(Math.min(-0, -0)), true);
++  assertEq(objectIsPositiveZero(Math.min(+0, +0)), true);
++  assertEq(Number.isNaN(Math.min(NaN, 5)), true);
++  assertEq(Number.isNaN(Math.min(5, NaN)), true);
++  assertEq(Math.min(-Infinity, 5), -Infinity);
++  assertEq(Math.min(Infinity, 5), 5);
++  assertEq(Math.min(1, 2), 1);
++}
++
++// Run cold (Baseline) and hot (Ion).
++check();
++for (let i = 0; i < 50000; i++) check();
+diff --git a/js/src/jit-test/tests/wasm/atomicity.js b/js/src/jit-test/tests/wasm/atomicity.js
+index 34327ec95741..ac1516083325 100644
+--- a/js/src/jit-test/tests/wasm/atomicity.js
++++ b/js/src/jit-test/tests/wasm/atomicity.js
+@@ -8,7 +8,11 @@
+ const DEBUG = 0;
+ 
+ // The longer we run, the better, really, but we don't want to time out.
+-const ITERATIONS = 100000;
++// Real PPC64 hardware retries lwarx/stwcx. reservation loops under
++// contention, which makes the default count exceed jit-test's 150 s
++// budget on POWER8 and (less so) POWER9/POWER10. Quarter the count
++// there to keep coverage while fitting the default budget.
++const ITERATIONS = getBuildConfiguration("ppc64") ? 25000 : 100000;
+ 
+ // If you change NUMWORKERS you must also change the tables for INIT, VAL, and
+ // RESULT for all the operations, below, by adding or removing bits.
+@@ -39,7 +43,7 @@ if (getCoreCount() < NUMAGENTS) {
+ 
+ if (getBuildConfiguration("arm-simulator") || getBuildConfiguration("arm64-simulator") ||
+     getBuildConfiguration("mips64-simulator") || getBuildConfiguration("riscv64-simulator") ||
+-    getBuildConfiguration("loong64-simulator"))
++    getBuildConfiguration("loong64-simulator") || getBuildConfiguration("ppc64-simulator"))
+ {
+     if (DEBUG > 0)
+         print("Atomicity test disabled on simulator");
+diff --git a/js/src/jit-test/tests/wasm/excessive-inlining.js b/js/src/jit-test/tests/wasm/excessive-inlining.js
+index 91ec710e4e46..a7d3b3211515 100644
+--- a/js/src/jit-test/tests/wasm/excessive-inlining.js
++++ b/js/src/jit-test/tests/wasm/excessive-inlining.js
+@@ -74,23 +74,26 @@ assertEq(tier2codeBytesUsed > 2000, true);
+ 
+ // But not an excessive amount.  This is the assertion that checks that
+ // the inlining-budget cutoff mechanism is working.
+-assertEq(tier2codeBytesUsed < 15000, true);
++// PPC64 generates larger code due to fixed-width 4-byte instructions,
++// multi-instruction branch stanzas, and longer constant-loading sequences.
++let tier2limit = getBuildConfiguration("ppc64") ? 25000 : 15000;
++assertEq(tier2codeBytesUsed < tier2limit, true);
+ 
+ // The thresholds above are based on the following measurements.
+ //
+ // tier1codeBytesUsed (baseline size)
+ //
+-//     x64      x32    arm64    arm32
++//     x64      x32    arm64    arm32    ppc64
+ //
+-//    1378     1010     1408     1008    --enable-debug build
+-//    1218      866     1248      856    --disable-debug build
++//    1378     1010     1408     1008     2736    --enable-debug build
++//    1218      866     1248      856            --disable-debug build
+ //
+ // tier2codeBytesUsed (optimized size), with inline-size budgeting enabled
+ //
+-//     x64      x32    arm64    arm32
++//     x64      x32    arm64    arm32    ppc64
+ //
+-//    5186     6994     7136     5472    --enable-debug build
+-//    3698     3730     5472     3888    --disable-debug build
++//    5186     6994     7136     5472    17408    --enable-debug build
++//    3698     3730     5472     3888            --disable-debug build
+ //
+ // tier2codeBytesUsed (optimized size), with inline-size budgeting disabled
+ //
+@@ -108,7 +111,7 @@ assertEq(tier2codeBytesUsed < 15000, true);
+ // (2) the optimized size will be at least 2000 bytes
+ //
+ // (3) if the inline-budget mechanism is working as intended, the optimized
+-//     size will be less than 15000 bytes
++//     size will be less than 15000 bytes (25000 on PPC64)
+ //
+ //
+ // Note (for future testing): inline-size budgeting was disabled by changing
+diff --git a/js/src/jit-test/tests/wasm/memory-oob-message.js b/js/src/jit-test/tests/wasm/memory-oob-message.js
+index 75248c6e6a56..c08e49bcc6e4 100644
+--- a/js/src/jit-test/tests/wasm/memory-oob-message.js
++++ b/js/src/jit-test/tests/wasm/memory-oob-message.js
+@@ -8,8 +8,16 @@ const hasOffsetMessage = wasmHugeMemoryEnabled();
+ 
+ function oobPattern(memIdx, byteOffset) {
+     if (hasOffsetMessage) {
++        // The reported address is whatever the kernel returned in
++        // siginfo.si_addr for the faulting instruction. Most backends emit
++        // the wasm access directly so si_addr equals byteOffset. PPC64 emits
++        // a 1-byte probing load at byteOffset + (size - 1) before each
++        // multi-byte access (to enforce wasm-spec atomicity on POWER ISA),
++        // so si_addr there can be up to 15 bytes past byteOffset.
++        const offsets = [];
++        for (let i = 0; i < 16; ++i) offsets.push(`${byteOffset + i}`);
+         return new RegExp(
+-            `out of bounds: memory ${memIdx} access at memory address ${byteOffset}`
++            `out of bounds: memory ${memIdx} access at memory address (?:${offsets.join('|')})`
+         );
+     }
+     return /index out of bounds/;
+diff --git a/js/src/jit-test/tests/wasm/ppc64-argon2-tiering.js b/js/src/jit-test/tests/wasm/ppc64-argon2-tiering.js
+new file mode 100644
+index 000000000000..04dad9240539
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/ppc64-argon2-tiering.js
+@@ -0,0 +1,124 @@
++// Test for wasm tiering correctness with argon2-style SIMD computation.
++// The argon2 fBlaMka function uses i64x2.extmul_low_i32x4_u, i64x2.shl,
++// i64x2.add, v128.xor, v128.or, i64x2.shr_u, and i8x16.shuffle.
++// A tiering bug can cause hash and verify to produce different results
++// when tier-up happens between them.
++//
++// This test runs the computation under both baseline and optimizing
++// compilers and verifies they produce identical results.
++
++var mod = new WebAssembly.Module(wasmTextToBinary(`
++  (module
++    (memory (export "mem") 10)
++    ;; Argon2 fBlaMka: a + b + 2 * trunc32(a) * trunc32(b)
++    ;; then rotations by 32, 24, 16, 63
++    (func $G_round (param i32)
++      (local v128 v128 v128 v128 v128 v128 v128 v128 v128)
++      (local.set 1 (v128.load (i32.add (local.get 0) (i32.const 0))))
++      (local.set 2 (v128.load (i32.add (local.get 0) (i32.const 16))))
++      (local.set 3 (v128.load (i32.add (local.get 0) (i32.const 32))))
++      (local.set 4 (v128.load (i32.add (local.get 0) (i32.const 48))))
++      (local.set 5 (v128.load (i32.add (local.get 0) (i32.const 64))))
++      (local.set 6 (v128.load (i32.add (local.get 0) (i32.const 80))))
++      (local.set 7 (v128.load (i32.add (local.get 0) (i32.const 96))))
++      (local.set 8 (v128.load (i32.add (local.get 0) (i32.const 112))))
++
++      ;; fBlaMka(v0, v2) + rotr32
++      (local.set 1 (i64x2.add (i64x2.add (local.get 1) (local.get 3))
++        (i64x2.shl (i64x2.extmul_low_i32x4_u
++          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 1) (local.get 1))
++          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 3) (local.get 3)))
++          (i32.const 1))))
++      (local.set 9 (v128.xor (local.get 7) (local.get 1)))
++      (local.set 7 (v128.or (i64x2.shl (local.get 9) (i32.const 32)) (i64x2.shr_u (local.get 9) (i32.const 32))))
++
++      ;; fBlaMka(v4, v6) + rotr24
++      (local.set 5 (i64x2.add (i64x2.add (local.get 5) (local.get 7))
++        (i64x2.shl (i64x2.extmul_low_i32x4_u
++          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 5) (local.get 5))
++          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 7) (local.get 7)))
++          (i32.const 1))))
++      (local.set 9 (v128.xor (local.get 3) (local.get 5)))
++      (local.set 3 (v128.or (i64x2.shl (local.get 9) (i32.const 40)) (i64x2.shr_u (local.get 9) (i32.const 24))))
++
++      ;; fBlaMka(v0, v2) + rotr16
++      (local.set 1 (i64x2.add (i64x2.add (local.get 1) (local.get 3))
++        (i64x2.shl (i64x2.extmul_low_i32x4_u
++          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 1) (local.get 1))
++          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 3) (local.get 3)))
++          (i32.const 1))))
++      (local.set 9 (v128.xor (local.get 7) (local.get 1)))
++      (local.set 7 (v128.or (i64x2.shl (local.get 9) (i32.const 48)) (i64x2.shr_u (local.get 9) (i32.const 16))))
++
++      ;; fBlaMka(v4, v6) + rotr63
++      (local.set 5 (i64x2.add (i64x2.add (local.get 5) (local.get 7))
++        (i64x2.shl (i64x2.extmul_low_i32x4_u
++          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 5) (local.get 5))
++          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 7) (local.get 7)))
++          (i32.const 1))))
++      (local.set 9 (v128.xor (local.get 3) (local.get 5)))
++      (local.set 3 (v128.or (i64x2.shl (local.get 9) (i32.const 1)) (i64x2.shr_u (local.get 9) (i32.const 63))))
++
++      (v128.store (i32.add (local.get 0) (i32.const 0)) (local.get 1))
++      (v128.store (i32.add (local.get 0) (i32.const 16)) (local.get 2))
++      (v128.store (i32.add (local.get 0) (i32.const 32)) (local.get 3))
++      (v128.store (i32.add (local.get 0) (i32.const 48)) (local.get 4))
++      (v128.store (i32.add (local.get 0) (i32.const 64)) (local.get 5))
++      (v128.store (i32.add (local.get 0) (i32.const 80)) (local.get 6))
++      (v128.store (i32.add (local.get 0) (i32.const 96)) (local.get 7))
++      (v128.store (i32.add (local.get 0) (i32.const 112)) (local.get 8)))
++
++    (func (export "hash") (param i32) (result i64)
++      (local i32)
++      ;; Init with Blake2b IV
++      (v128.store (i32.const 0) (v128.const i64x2 0x6a09e667f3bcc908 0xbb67ae8584caa73b))
++      (v128.store (i32.const 16) (v128.const i64x2 0x3c6ef372fe94f82b 0xa54ff53a5f1d36f1))
++      (v128.store (i32.const 32) (v128.const i64x2 0x510e527fade682d1 0x9b05688c2b3e6c1f))
++      (v128.store (i32.const 48) (v128.const i64x2 0x1f83d9abfb41bd6b 0x5be0cd19137e2179))
++      (v128.store (i32.const 64) (v128.const i64x2 0x0123456789abcdef 0xfedcba9876543210))
++      (v128.store (i32.const 80) (v128.const i64x2 0xdeadbeefcafebabe 0x1122334455667788))
++      (v128.store (i32.const 96) (v128.const i64x2 0xaabbccdd11223344 0x5566778899aabbcc))
++      (v128.store (i32.const 112) (v128.const i64x2 0xddeeff0011223344 0x5566778899aabbcc))
++      (local.set 1 (i32.const 0))
++      (block (loop
++        (call $G_round (i32.const 0))
++        (local.set 1 (i32.add (local.get 1) (i32.const 1)))
++        (br_if 1 (i32.ge_u (local.get 1) (local.get 0)))
++        (br 0)))
++      (i64.xor (i64.load (i32.const 0))
++        (i64.xor (i64.load (i32.const 8))
++          (i64.xor (i64.load (i32.const 16))
++            (i64.xor (i64.load (i32.const 24))
++              (i64.xor (i64.load (i32.const 32))
++                (i64.xor (i64.load (i32.const 40))
++                  (i64.xor (i64.load (i32.const 48))
++                    (i64.xor (i64.load (i32.const 56))
++                      (i64.xor (i64.load (i32.const 64))
++                        (i64.xor (i64.load (i32.const 72))
++                          (i64.xor (i64.load (i32.const 80))
++                            (i64.xor (i64.load (i32.const 88))
++                              (i64.xor (i64.load (i32.const 96))
++                                (i64.xor (i64.load (i32.const 104))
++                                  (i64.xor (i64.load (i32.const 112))
++                                    (i64.load (i32.const 120))))))))))))))))))
++  )
++`));
++
++var inst = new WebAssembly.Instance(mod);
++
++// Get a reference result from the first call.
++var reference = inst.exports.hash(100);
++
++// Run many times to trigger tier-up, then verify result stays the same.
++var pass = true;
++for (var i = 0; i < 1000; i++) {
++    var r = inst.exports.hash(100);
++    if (r !== reference) {
++        pass = false;
++        throw new Error("Tiering mismatch at iteration " + i +
++            ": got 0x" + BigInt.asUintN(64, r).toString(16) +
++            ", expected 0x" + BigInt.asUintN(64, reference).toString(16));
++    }
++}
++
++assertEq(pass, true);
+diff --git a/js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js b/js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js
+new file mode 100644
+index 000000000000..c11ce713f514
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js
+@@ -0,0 +1,70 @@
++// |jit-test| skip-if: true
++//
++// Benchmark only, not a correctness test. Invoke manually as shown below.
++//
++// Microbenchmark for wasm compare+select fusion on PPC64.
++//
++// Run with:
++//   $JS --wasm-compiler=optimizing \
++//       js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js
++//
++// Prints timings for four variants (i32, i64, f32, f64) that exercise a
++// tight loop of N select-on-compare operations. Used to decide whether
++// specializing lowerWasmCompareAndSelect beyond Int32 is worth the code.
++//
++// The kernel is a 10-stage select chain so the per-op overhead dominates
++// the loop frame. Each iteration touches 10 compare+select ops plus
++// ~trivial address math.
++
++const N_ITERS = 1_000_000;
++
++function buildModule(kind) {
++  const types = {i32: ['i32', 'i32', 'i32.lt_s'],
++                 u32: ['i32', 'i32', 'i32.lt_u'],
++                 i64: ['i64', 'i64', 'i64.lt_s'],
++                 f32: ['f32', 'i32', 'f32.lt'],
++                 f64: ['f64', 'i32', 'f64.lt']}[kind];
++  const [ty, iterTy, cmpOp] = types;
++  // Load a, b; compute chain of (b < a ? b : a) 10 times per iter.
++  const stage = `
++    (local.set $a
++      (select (result ${ty})
++        (local.get $b) (local.get $a)
++        (${cmpOp} (local.get $b) (local.get $a))))`;
++  const body = Array(10).fill(stage).join('\n');
++  const text = `
++    (module
++      (func (export "run") (param $n i32) (result ${ty})
++        (local $i i32) (local $a ${ty}) (local $b ${ty})
++        (local.set $a (${ty}.const ${kind === 'f32' || kind === 'f64' ? '3.14' : '12345'}))
++        (local.set $b (${ty}.const ${kind === 'f32' || kind === 'f64' ? '2.71' : '67890'}))
++        (loop $L
++          ${body}
++          (local.set $i (i32.add (local.get $i) (i32.const 1)))
++          (br_if $L (i32.lt_s (local.get $i) (local.get $n))))
++        (local.get $a)))`;
++  return new WebAssembly.Module(wasmTextToBinary(text));
++}
++
++function bench(kind) {
++  const inst = new WebAssembly.Instance(buildModule(kind));
++  // Warmup — ensure Ion compiles.
++  for (let i = 0; i < 3; i++) inst.exports.run(N_ITERS);
++  const t0 = dateNow();
++  const res = inst.exports.run(N_ITERS);
++  const t1 = dateNow();
++  return {ms: t1 - t0, result: res};
++}
++
++const kinds = ['i32', 'u32', 'i64', 'f32', 'f64'];
++const runs = 5;
++print(`\nwasm compare+select microbench (${N_ITERS.toLocaleString()} iters, 10 ops/iter):`);
++print(`  Each timing is the best of ${runs} runs.\n`);
++for (const kind of kinds) {
++  const samples = [];
++  for (let i = 0; i < runs; i++) samples.push(bench(kind).ms);
++  samples.sort((a, b) => a - b);
++  const best = samples[0];
++  const median = samples[(runs / 2) | 0];
++  print(`  ${kind.padEnd(4)} best=${best.toFixed(1)}ms  median=${median.toFixed(1)}ms  (samples: ${samples.map(s => s.toFixed(0)).join(',')})`);
++}
+diff --git a/js/src/jit-test/tests/wasm/ppc64-extmul-alias.js b/js/src/jit-test/tests/wasm/ppc64-extmul-alias.js
+new file mode 100644
+index 000000000000..2aa9507751b6
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/ppc64-extmul-alias.js
+@@ -0,0 +1,107 @@
++// Regression test for PPC64 i64x2.extmul_{low,high}_i32x4_{s,u} when the
++// Ion register allocator picks dest == rhs.
++//
++// On PPC64 LE, the old implementation extracted lanes via mtvsrd/mfvsrd and
++// wrote the low-lane product to dest before reading rhs for the high lane.
++// `mtvsrd XT, RA` leaves DW1 of XT undefined (POWER9 zeros it), so when
++// dest aliased rhs the high-lane extract from rhs read garbage, producing
++// zero in the high i64 lane. On POWER8 the ExtractLaneToGPR fallback
++// additionally clobbered ScratchSimd128Reg between the two extracts.
++//
++// The loop below, discovered via wasm-reduce from argon2.wasm, reliably
++// reproduced the miscompile: the result's high i64 lane went to 0 on
++// POWER9 Ion / garbage on POWER8 Ion, while baseline kept the correct
++// value (lane1 = 48*48 = 2304 in the final iteration).
++
++var mod = new WebAssembly.Module(wasmTextToBinary(`
++  (module
++    (memory (export "mem") 1)
++    (func (export "run_u") (param $out i32)
++      (local $i i32) (local $v4 v128) (local $v5 v128) (local $v9 v128)
++      (loop
++        (local.set $v9
++          (i64x2.add
++            (v128.const i32x4 1 0 0 0)
++            (i64x2.extmul_low_i32x4_u (local.get $v5) (local.get $v9))))
++        (local.set $v4 (local.get $v9))
++        (local.set $v5 (local.get $v4))
++        (v128.store (i32.const 0) (local.get $v5))
++        (local.set $i (i32.add (local.get $i) (i32.const 1)))
++        (br_if 0 (i32.ne (local.get $i) (i32.const 8))))
++      (v128.store (local.get $out) (local.get $v9)))
++
++    (func (export "run_s") (param $out i32)
++      (local $i i32) (local $v v128)
++      (local.set $v (v128.const i32x4 2 3 5 7))
++      (loop
++        ;; Force dest==rhs aliasing: v = extmul_low_i32x4_s(const, v).
++        (local.set $v
++          (i64x2.extmul_low_i32x4_s
++            (v128.const i32x4 2 3 5 7)
++            (local.get $v)))
++        (local.set $i (i32.add (local.get $i) (i32.const 1)))
++        (br_if 0 (i32.ne (local.get $i) (i32.const 2))))
++      (v128.store (local.get $out) (local.get $v)))
++
++    (func (export "run_high_u") (param $out i32)
++      (local $i i32) (local $v v128)
++      (local.set $v (v128.const i32x4 0 0 2 3))
++      (loop
++        (local.set $v
++          (i64x2.extmul_high_i32x4_u
++            (v128.const i32x4 0 0 2 3)
++            (local.get $v)))
++        (local.set $i (i32.add (local.get $i) (i32.const 1)))
++        (br_if 0 (i32.ne (local.get $i) (i32.const 2))))
++      (v128.store (local.get $out) (local.get $v)))
++
++    (func (export "run_high_s") (param $out i32)
++      (local $i i32) (local $v v128)
++      (local.set $v (v128.const i32x4 0 0 2 3))
++      (loop
++        (local.set $v
++          (i64x2.extmul_high_i32x4_s
++            (v128.const i32x4 0 0 2 3)
++            (local.get $v)))
++        (local.set $i (i32.add (local.get $i) (i32.const 1)))
++        (br_if 0 (i32.ne (local.get $i) (i32.const 2))))
++      (v128.store (local.get $out) (local.get $v))))
++`));
++
++function runAndCheck(inst) {
++  inst.exports.run_u(0);
++  // After 8 iterations, the value in memory should have lane1 == 2304 = 0x900.
++  // Bytes 8-15 (i64 lane 1, little-endian) = 0x0000000000000900.
++  var buf = new Uint8Array(inst.exports.mem.buffer, 0, 16);
++  var hex = Array.from(buf).map(b => b.toString(16).padStart(2,'0')).join('');
++  // Expect bytes 8-9 = "00 09" and bytes 10-15 = "00 00 00 00 00 00".
++  assertEq(hex.slice(16, 32), "0009000000000000");
++
++  inst.exports.run_s(16);
++  // After 2 iterations of v = extmul_low_s(const(2,3,5,7), v) starting v=(2,3,5,7):
++  //   iter 1: i64x2 lane0 = 2*2 = 4, lane1 = 3*3 = 9.
++  //           v becomes i32x4 [4, 0, 9, 0] (each i64 lane occupies two i32 lanes).
++  //   iter 2: extmul_low_s reads i32 lanes 0, 1 of v = (4, 0).
++  //           i64 lane0 = 2*4 = 8; i64 lane1 = 3*0 = 0.
++  var buf2 = new Uint8Array(inst.exports.mem.buffer, 16, 16);
++  var hex2 = Array.from(buf2).map(b => b.toString(16).padStart(2,'0')).join('');
++  assertEq(hex2, "08000000000000000000000000000000");
++
++  inst.exports.run_high_u(32);
++  // v = (0, 0, 2, 3). extmul_high picks lanes 2 and 3.
++  //   iter 1: lane2_prod = 2*2 = 4; lane3_prod = 3*3 = 9. Result stored at bytes 0-7 (lane2_prod) and 8-15 (lane3_prod).
++  //   iter 2: v now has i64x2 lane0 = 4, lane1 = 9, i.e. i32x4 lanes [4, 0, 9, 0].
++  //           extmul_high_u(const(0,0,2,3), v) reads lanes 2, 3 of both:
++  //           const lane2 = 2, lane3 = 3; v lane2 = 9, lane3 = 0.
++  //           result: lane2_prod = 2*9 = 18 at bytes 0-7; lane3_prod = 3*0 = 0 at bytes 8-15.
++  var buf3 = new Uint8Array(inst.exports.mem.buffer, 32, 16);
++  var hex3 = Array.from(buf3).map(b => b.toString(16).padStart(2,'0')).join('');
++  assertEq(hex3, "12000000000000000000000000000000");
++
++  inst.exports.run_high_s(48);
++  var buf4 = new Uint8Array(inst.exports.mem.buffer, 48, 16);
++  var hex4 = Array.from(buf4).map(b => b.toString(16).padStart(2,'0')).join('');
++  assertEq(hex4, "12000000000000000000000000000000");
++}
++
++runAndCheck(new WebAssembly.Instance(mod));
+diff --git a/js/src/jit-test/tests/wasm/ppc64-simd-vr-clobber.js b/js/src/jit-test/tests/wasm/ppc64-simd-vr-clobber.js
+new file mode 100644
+index 000000000000..d5f79a1840a6
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/ppc64-simd-vr-clobber.js
+@@ -0,0 +1,179 @@
++// |jit-test| skip-if: !wasmSimdEnabled()
++//
++// Regression tests for PPC64 SIMD helpers that use VR1..VR5 as undeclared
++// scratch and silently corrupt live wasm v128 values the register allocator
++// has placed in those VRs.
++//
++// Background: PPC64 Simd128 lives in VR0..VR31. VR0 is non-allocatable
++// (= ScratchSimd128Reg); VR1..VR31 are allocatable. The helpers below
++// historically used VR1..VR5 as undeclared scratch:
++//
++//   negInt8x16, negInt16x8                    : clobber VR1 (all CPUs)
++//   negInt32x4, negInt64x2 (POWER8 fallback)  : clobber VR1 (POWER8 only)
++//   extAddPairwiseInt8x16  (signed/unsigned)  : clobber VR1, VR2, VR3
++//   extAddPairwiseInt16x8  (signed/unsigned)  : clobber VR1, VR2, VR3
++//   unsignedWidenHighInt32x4                  : clobber VR1
++//
++// Each test:
++//   - loads `nLive` "preserve" v128 values from memory at offsets 16..16+16*nLive
++//   - loads ONE additional "input" v128 = repeat(0x18) at offset 128
++//   - applies the suspect helper to the input
++//   - stores the nLive preserved values back to memory at offsets 0..16*nLive
++//   - stores the helper result at offset 16*nLive
++//
++// Without the fix, one of the preserved locals (whichever the allocator
++// placed in the clobbered VR) reads back as the staged input value (0x18)
++// instead of its original. With the fix (the helper using ScratchSimd128Scope
++// or proper VR-namespace emit), all preserved locals retain their values.
++
++const PRESERVE_PATTERNS = [0xA1, 0xB2, 0xC3, 0xD4, 0xE5, 0xF6, 0x07, 0x29];
++const INPUT_BYTE = 0x18;
++
++function init(mem) {
++  // Slots at offset 16, 32, ..., 16+16*7 hold the preserve patterns.
++  for (let slot = 0; slot < PRESERVE_PATTERNS.length; slot++) {
++    for (let i = 0; i < 16; i++) {
++      mem[16 + slot * 16 + i] = PRESERVE_PATTERNS[slot];
++    }
++  }
++  // The helper input is at offset 128 (= 16 + 16*7 + 16 = 144? no, 16 + 16*8 = 144).
++  // Use a fixed offset PAST the preserve area. With nLive max 7, preserve uses
++  // 16..(16+16*7-1) = 16..127. Input goes at 144 to leave a 16-byte gap.
++  const INPUT_OFFSET = 144;
++  for (let i = 0; i < 16; i++) mem[INPUT_OFFSET + i] = INPUT_BYTE;
++}
++
++function repeat(byte) {
++  const a = new Array(16);
++  for (let i = 0; i < 16; i++) a[i] = byte;
++  return a;
++}
++
++// Verify nLive preserved slots match PRESERVE_PATTERNS at output offsets
++// 0..16*nLive, and that the result slot at 16*nLive matches `expectedResult`.
++function check(opName, mem, nLive, expectedResult) {
++  for (let slot = 0; slot < nLive; slot++) {
++    for (let i = 0; i < 16; i++) {
++      const got = mem[slot * 16 + i];
++      const want = PRESERVE_PATTERNS[slot];
++      assertEq(got, want,
++               `${opName}: live slot ${slot} byte ${i}: got 0x${got.toString(16)}, expected 0x${want.toString(16)} (allocator-clobbered VR?)`);
++    }
++  }
++  for (let i = 0; i < 16; i++) {
++    const got = mem[nLive * 16 + i];
++    const want = expectedResult[i];
++    assertEq(got, want,
++             `${opName}: result byte ${i}: got 0x${got.toString(16)}, expected 0x${want.toString(16)}`);
++  }
++}
++
++// Build a wasm module that:
++//  - loads `nLive` preserve v128 locals from memory at offsets 16..16*nLive
++//  - loads ONE input v128 from offset 144
++//  - applies `op` to the input
++//  - stores all `nLive + 1` v128 values back to memory at offsets 0..16*nLive
++function buildModule(op, nLive) {
++  const localDecls = [];
++  const initLoads = [];
++  const finalStores = [];
++  for (let i = 0; i < nLive; i++) {
++    localDecls.push(`(local $v${i} v128)`);
++    initLoads.push(`(local.set $v${i} (v128.load (i32.const ${16 + i * 16})))`);
++    finalStores.push(`(v128.store (i32.const ${i * 16}) (local.get $v${i}))`);
++  }
++  // The helper input + result.
++  localDecls.push(`(local $input v128)`);
++  initLoads.push(`(local.set $input (v128.load (i32.const 144)))`);
++  finalStores.push(`(v128.store (i32.const ${nLive * 16}) (local.get $input))`);
++
++  const text = `
++    (module
++      (memory (export "mem") 1)
++      (func (export "run")
++        ${localDecls.join('\n        ')}
++        ${initLoads.join('\n        ')}
++        (local.set $input (${op} (local.get $input)))
++        ${finalStores.join('\n        ')}
++      )
++    )`;
++  return new WebAssembly.Module(wasmTextToBinary(text));
++}
++
++function runOne(opName, op, nLive, expectedResult) {
++  const mod = buildModule(op, nLive);
++  const inst = new WebAssembly.Instance(mod);
++  const mem = new Uint8Array(inst.exports.mem.buffer);
++  // Run many times so Baseline + Ion both see it.
++  for (let warm = 0; warm < 50; warm++) {
++    init(mem);
++    inst.exports.run();
++    check(opName, mem, nLive, expectedResult);
++  }
++}
++
++// ---- Negate helpers ----
++//
++// Input lane = 0x18 = 24. neg(24) = -24.
++// i8x16.neg : -24 mod 256 = 232 = 0xE8 per byte.
++// i16x8.neg : lane = 0x1818 = 6168, neg = -6168 mod 65536 = 0xE7E8.
++//             Memory LE: per i16 lane bytes 0xE8 0xE7.
++// i32x4.neg : lane = 0x18181818 = 404232216, neg = 0xE7E7E7E8.
++//             Memory LE: per i32 lane bytes 0xE8 0xE7 0xE7 0xE7.
++// i64x2.neg : lane = 0x1818181818181818, neg = 0xE7E7E7E7E7E7E7E8.
++//             Memory LE: per i64 lane bytes 0xE8 0xE7 0xE7 0xE7 0xE7 0xE7 0xE7 0xE7.
++
++runOne("i8x16.neg", "i8x16.neg", 4, repeat(0xE8));
++runOne("i16x8.neg", "i16x8.neg", 4,
++       [0xE8,0xE7, 0xE8,0xE7, 0xE8,0xE7, 0xE8,0xE7,
++        0xE8,0xE7, 0xE8,0xE7, 0xE8,0xE7, 0xE8,0xE7]);
++runOne("i32x4.neg", "i32x4.neg", 4,
++       [0xE8,0xE7,0xE7,0xE7, 0xE8,0xE7,0xE7,0xE7,
++        0xE8,0xE7,0xE7,0xE7, 0xE8,0xE7,0xE7,0xE7]);
++runOne("i64x2.neg", "i64x2.neg", 4,
++       [0xE8,0xE7,0xE7,0xE7,0xE7,0xE7,0xE7,0xE7,
++        0xE8,0xE7,0xE7,0xE7,0xE7,0xE7,0xE7,0xE7]);
++
++// ---- extAddPairwise helpers ----
++//
++// extadd_pairwise reads adjacent pairs and widens-then-sums them.
++// Input = repeat(0x18) = 24.
++// i16x8.extadd_pairwise_i8x16_s : 24 + 24 = 48 = 0x0030 per i16 lane.
++//                                  Memory LE: 0x30 0x00 per lane × 8 lanes.
++// i16x8.extadd_pairwise_i8x16_u : same since input is positive.
++// i32x4.extadd_pairwise_i16x8_s : i16 lane = 0x1818 = 6168, sum = 12336 = 0x00003030.
++//                                  Memory LE: 0x30 0x30 0x00 0x00 per lane × 4 lanes.
++// i32x4.extadd_pairwise_i16x8_u : same since input is positive.
++
++runOne("i16x8.extadd_pairwise_i8x16_s",
++       "i16x8.extadd_pairwise_i8x16_s", 4,
++       [0x30,0x00, 0x30,0x00, 0x30,0x00, 0x30,0x00,
++        0x30,0x00, 0x30,0x00, 0x30,0x00, 0x30,0x00]);
++
++runOne("i16x8.extadd_pairwise_i8x16_u",
++       "i16x8.extadd_pairwise_i8x16_u", 4,
++       [0x30,0x00, 0x30,0x00, 0x30,0x00, 0x30,0x00,
++        0x30,0x00, 0x30,0x00, 0x30,0x00, 0x30,0x00]);
++
++runOne("i32x4.extadd_pairwise_i16x8_s",
++       "i32x4.extadd_pairwise_i16x8_s", 4,
++       [0x30,0x30,0x00,0x00, 0x30,0x30,0x00,0x00,
++        0x30,0x30,0x00,0x00, 0x30,0x30,0x00,0x00]);
++
++runOne("i32x4.extadd_pairwise_i16x8_u",
++       "i32x4.extadd_pairwise_i16x8_u", 4,
++       [0x30,0x30,0x00,0x00, 0x30,0x30,0x00,0x00,
++        0x30,0x30,0x00,0x00, 0x30,0x30,0x00,0x00]);
++
++// ---- unsignedWidenHighInt32x4 ----
++//
++// i64x2.extend_high_i32x4_u: take the high two i32 lanes (lanes 2 and 3) of
++// the input, zero-extend each to i64, lay them out as i64x2.
++// Input lane = 0x18181818 (positive, =404232216).
++// Result: two i64 lanes, each = 0x0000000018181818.
++// Memory LE: per i64 lane bytes 0x18 0x18 0x18 0x18 0x00 0x00 0x00 0x00.
++
++runOne("i64x2.extend_high_i32x4_u",
++       "i64x2.extend_high_i32x4_u", 4,
++       [0x18,0x18,0x18,0x18,0x00,0x00,0x00,0x00,
++        0x18,0x18,0x18,0x18,0x00,0x00,0x00,0x00]);
+diff --git a/js/src/jit-test/tests/wasm/profiling.js b/js/src/jit-test/tests/wasm/profiling.js
+index f4872b07cde8..ccd9690a262f 100644
+--- a/js/src/jit-test/tests/wasm/profiling.js
++++ b/js/src/jit-test/tests/wasm/profiling.js
+@@ -117,6 +117,13 @@ for (let type of ['f32', 'f64']) {
+         if (getBuildConfiguration("arm64")) {
+             continue;
+         }
++        // PPC64 inlines ceil/floor/trunc as frip/frim/friz (see
++        // Assembler-ppc64.h HasRoundInstruction), so no builtin thunk
++        // frames exist to profile. `nearest` still goes through the
++        // thunk because PPC64's frin is not IEEE round-to-even.
++        if (getBuildConfiguration("ppc64") && func !== 'nearest') {
++            continue;
++        }
+         test(`(module
+             (func (export "") (param ${type}) (result ${type})
+                 local.get 0
+diff --git a/js/src/jit-test/tests/wasm/regress-ppc64-extract-lane-ctz.js b/js/src/jit-test/tests/wasm/regress-ppc64-extract-lane-ctz.js
+new file mode 100644
+index 000000000000..e2cf5def541e
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/regress-ppc64-extract-lane-ctz.js
+@@ -0,0 +1,49 @@
++// |jit-test| --wasm-compiler=optimizing; skip-if: !wasmSimdEnabled()
++//
++// Regression test for a PPC64 i32x4.extract_lane canonicalization bug.
++//
++// ExtractLaneToGPR leaves the adjacent lane in the high 32 bits of the GPR for
++// the unshifted lanes (0 and 2), so extractLaneInt32x4 must sign-extend its i32
++// result (as the i8x16/i16x8 extracts do). Without that, a consumer that reads
++// the full 64-bit register sees garbage in the high half. The POWER8 i32.ctz
++// emulation is such a consumer: its 64-bit neg/and. zero-check disagrees with
++// its 32-bit cntlzw, so ctz of a zero lane sitting next to a nonzero neighbour
++// returned -1 instead of 32.
++//
++// The vector comes from memory (runtime, not constant-foldable) and is passed
++// through a SIMD op so the extract is a genuine vector-register extract. Run
++// under MOZ_PPC64_FORCE_POWER8=1 to exercise the emulated ctz path; in every
++// other mode this is simply a correctness check.
++
++const ins = wasmEvalText(`(module
++  (memory (export "mem") 1)
++  (func $v (result v128)
++    ;; identity AND keeps the value in a vector register and forces a real
++    ;; extractLaneInt32x4 rather than an extract-of-load fold.
++    (v128.and (v128.load (i32.const 0)) (v128.const i32x4 -1 -1 -1 -1)))
++  (func (export "ctz0") (result i32) (i32.ctz (i32x4.extract_lane 0 (call $v))))
++  (func (export "ctz1") (result i32) (i32.ctz (i32x4.extract_lane 1 (call $v))))
++  (func (export "ctz2") (result i32) (i32.ctz (i32x4.extract_lane 2 (call $v))))
++  (func (export "ctz3") (result i32) (i32.ctz (i32x4.extract_lane 3 (call $v))))
++  (func (export "sext0") (result i64) (i64.extend_i32_s (i32x4.extract_lane 0 (call $v))))
++  (func (export "sext2") (result i64) (i64.extend_i32_s (i32x4.extract_lane 2 (call $v))))
++)`).exports;
++
++const mem = new Int32Array(ins.mem.buffer);
++function setLanes(a, b, c, d) { mem[0] = a; mem[1] = b; mem[2] = c; mem[3] = d; }
++
++// Each lane = 0 surrounded by nonzero neighbours: ctz must be 32, never -1.
++setLanes(0, -1, -1, -1); assertEq(ins.ctz0(), 32);
++setLanes(-1, 0, -1, -1); assertEq(ins.ctz1(), 32);
++setLanes(-1, -1, 0, -1); assertEq(ins.ctz2(), 32);
++setLanes(-1, -1, -1, 0); assertEq(ins.ctz3(), 32);
++
++// Nonzero lanes: ctz of the lane value, regardless of neighbours.
++setLanes(0x10, -1, 0x100000, -1);
++assertEq(ins.ctz0(), 4);
++assertEq(ins.ctz2(), 20);
++
++// A negative lane must sign-extend correctly (the canonicalization is extsw).
++setLanes(-2, 7, -3, 7);
++assertEq(ins.sext0(), -2n);
++assertEq(ins.sext2(), -3n);
+diff --git a/js/src/jit-test/tests/wasm/regress-ppc64-select-condition.js b/js/src/jit-test/tests/wasm/regress-ppc64-select-condition.js
+new file mode 100644
+index 000000000000..c38975dce859
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/regress-ppc64-select-condition.js
+@@ -0,0 +1,30 @@
++// |jit-test| --wasm-compiler=optimizing; skip-if: !wasmSimdEnabled()
++//
++// Regression test for a PPC64 wasm Ion miscompile of `select` with a 32-bit
++// condition. visitWasmSelect tested the i32 condition with a 64-bit compare
++// (cmpdi / branchTestPtr). When the condition was zero in its low 32 bits but
++// had garbage in the high 32 bits (as can happen under register pressure), the
++// 64-bit test read it as non-zero and select returned the wrong operand.
++//
++// Here the condition `$x3` is 0; `select($x8, -952809828, $x3)` must therefore
++// return -952809828. The surrounding SIMD shuffle/bitselect/swizzle chain
++// supplies the v128 register pressure that exposed the bug.
++
++const wat = `(module (func (export "f") (result i64)
++  (local $x3 i32)(local $x7 i32)(local $x8 i32)
++  (local $w0 v128)(local $w1 v128)(local $w2 v128)(local $w3 v128)
++  (local $w4 v128)(local $w5 v128)(local $w6 v128)(local $w7 v128)
++  (local.set $w0 (v128.const i32x4 1708443454 1532218695 2107423610 -1265775005))
++  (local.set $w2 (v128.const i32x4 -752312355 -625530572 -844666500 832036408))
++  (local.set $w7 (v128.const i32x4 115003496 -970441117 -43225935 1874128204))
++  (local.set $w4 (i8x16.shuffle 15 18 13 2 6 22 20 8 19 10 12 8 11 5 6 28 (local.get $w7) (local.get $w3)))
++  (local.set $w6 (v128.bitselect (local.get $w4) (local.get $w0) (local.get $w7)))
++  (local.set $w1 (v128.const i32x4 -1635025264 -629784132 1517869852 1651771825))
++  (local.set $w7 (v128.bitselect (local.get $w6) (local.get $w2) (local.get $w2)))
++  (local.set $w6 (i8x16.swizzle (local.get $w1) (local.get $w7)))
++  (local.set $x3 (i32x4.extract_lane 2 (local.get $w6)))
++  (local.set $x7 (select (local.get $x8) (i32.const -952809828) (local.get $x3)))
++  (i64.extend_i32_s (local.get $x7))))`;
++
++const ins = new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(wat)));
++assertEq(ins.exports.f(), -952809828n);
+diff --git a/js/src/jit-test/tests/wasm/regress-ppc64-trap-exit-simd-save.js b/js/src/jit-test/tests/wasm/regress-ppc64-trap-exit-simd-save.js
+new file mode 100644
+index 000000000000..4887f8df119c
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/regress-ppc64-trap-exit-simd-save.js
+@@ -0,0 +1,64 @@
++// |jit-test| exitstatus: 0; skip-if: !wasmSimdEnabled()
++//
++// Regression test for the PPC64 wasm trap exit losing live v128 state.
++//
++// On PPC64, doubles live in the FPRs (VSR0-31) while wasm v128 values live in
++// the VRs (VSR32-63) -- disjoint physical pools. The trap exit's
++// RegsToPreserve used AllDoubleMask only, so a trap firing while a v128 was
++// live resumed with whatever the C++ interrupt path's libc left in the VRs
++// (glibc's misaligned vector memcpy leaves lvsl alignment-control byte
++// patterns there). Interrupt checks fire via traps at loop back-edges, where
++// a loop-carried v128 accumulator is exactly what is live.
++//
++// The loop below keeps an i32x4 accumulator live across every back-edge while
++// interrupts fire repeatedly; the callback does large misaligned copies to
++// pull libc's vector memcpy through the VRs. On an unfixed build (real
++// silicon; the simulator's VRs are insulated from native libc) the
++// accumulator comes back holding garbage and the final lane values are wrong.
++
++const ins = wasmEvalText(`(module
++  (func (export "run") (param $n i32) (result i32)
++    (local $acc v128)
++    (block $done
++      (loop $top
++        (br_if $done (i32.eqz (local.get $n)))
++        (local.set $acc (i32x4.add (local.get $acc) (v128.const i32x4 1 2 3 4)))
++        (local.set $n (i32.sub (local.get $n) (i32.const 1)))
++        (br $top)))
++    ;; Fold the four lanes so any lane corruption shows up.
++    (i32.xor
++      (i32.xor (i32x4.extract_lane 0 (local.get $acc))
++               (i32.rotl (i32x4.extract_lane 1 (local.get $acc)) (i32.const 8)))
++      (i32.xor (i32.rotl (i32x4.extract_lane 2 (local.get $acc)) (i32.const 16))
++               (i32.rotl (i32x4.extract_lane 3 (local.get $acc)) (i32.const 24)))))
++)`).exports;
++
++// Misaligned big copies drive glibc's lvsl/vperm memcpy path on PPC.
++const big = new Uint8Array(1 << 20);
++const src = big.subarray(1, (1 << 19) + 1);
++const dst = new Uint8Array(1 << 19);
++
++let fires = 0;
++function onInterrupt() {
++  fires++;
++  for (let i = 0; i < 4; i++) {
++    dst.set(src);
++  }
++  if (fires < 25) {
++    timeout(0.02, onInterrupt);
++  }
++  return true;
++}
++
++function expected(n) {
++  const r = (x, k) => ((x << k) | (x >>> (32 - k))) | 0;
++  const l = [n | 0, (2 * n) | 0, (3 * n) | 0, (4 * n) | 0];
++  return ((l[0] ^ r(l[1], 8)) ^ (r(l[2], 16) ^ r(l[3], 24))) | 0;
++}
++
++const N = 1 << 26;
++timeout(0.02, onInterrupt);
++const got = ins.run(N);
++// Cancel any pending watchdog before finishing.
++timeout(-1);
++assertEq(got, expected(N));
+diff --git a/js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.js b/js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.js
+new file mode 100644
+index 000000000000..b7ec0d9548bb
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.js
+@@ -0,0 +1,7 @@
++// Regression test for a PPC64-specific wasm Ion crash in
++// CodeGenerator::visitWasmReduceAndBranchSimd128 — it called
++// LBlock::label() directly on the branch targets without going through
++// skipTrivialBlocks(), so a trivial goto-only successor tripped
++// LBlock::label()'s !isTrivial() assertion. Reduced from grantkot.com/poly
++// with wasm-reduce. Triggers the bug under --wasm-compiler=optimizing.
++new WebAssembly.Module(os.file.readFile(scriptdir + "/bug-ppc64-simd-reduce-and-branch.wasm", "binary"));
+diff --git a/js/src/jit-test/tests/wasm/simd/bug1946618.js b/js/src/jit-test/tests/wasm/simd/bug1946618.js
+index cc02d0d8dfd7..fcf3a2a35e82 100644
+--- a/js/src/jit-test/tests/wasm/simd/bug1946618.js
++++ b/js/src/jit-test/tests/wasm/simd/bug1946618.js
+@@ -48,7 +48,12 @@ for (let op of ["f32x4.relaxed_min", "f32x4.relaxed_max",
+     // baseline.
+     let result1 = i.exports.variant1();
+     let result2 = i.exports.variant2();
+-    if (getBuildConfiguration("arm64")) {
++    if (getBuildConfiguration("ppc64")) {
++      // PPC64: xvminsp/xvmaxsp always returns the non-NaN operand,
++      // regardless of operand order. Both variants give zero (non-NaN).
++      assertEq(result1, 0);
++      assertEq(result2, 0);
++    } else if (getBuildConfiguration("arm64")) {
+       // The relaxed_min/max operation appears to propagate NaNs symmetrically
+       // from either arg
+       assertEq(result1, 65535);
+diff --git a/js/src/jit-test/tests/wasm/simd/ion-analysis.js b/js/src/jit-test/tests/wasm/simd/ion-analysis.js
+index d12af6e6fbc9..335f831ff6a9 100644
+--- a/js/src/jit-test/tests/wasm/simd/ion-analysis.js
++++ b/js/src/jit-test/tests/wasm/simd/ion-analysis.js
+@@ -12,6 +12,7 @@
+ // generates the expected result.
+ 
+ var isArm64 = getBuildConfiguration("arm64");
++var isPPC64 = getBuildConfiguration("ppc64");
+ 
+ // 32-bit permutation that is not a rotation.
+ let perm32x4_pattern = [4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3];
+@@ -846,7 +847,7 @@ for ( let [ty128,size] of [['i8x16',1], ['i16x8',2], ['i32x4',4]] ) {
+     let ops = { all_true: allTrue, any_true: anyTrue, bitmask };
+ 
+     for ( let op of ['any_true', 'all_true', 'bitmask'] ) {
+-        let folded = op != 'bitmask' || (size == 2 && !isArm64);
++        let folded = op != 'bitmask' || (size == 2 && !isArm64 && !isPPC64);
+         let operation = op == 'any_true' ? 'v128.any_true' : `${ty128}.${op}`;
+         let positive =
+             wasmCompile(
+@@ -898,12 +899,12 @@ for ( let [ty128,size] of [['i8x16',1], ['i16x8',2], ['i32x4',4]] ) {
+ 
+ // Bitselect with constant mask folded into shuffle operation
+ 
+-if (!isArm64) {
++if (!isArm64 && !isPPC64) {
+   wasmCompile(`
+   (module (func (param v128) (param v128) (result v128)
+     (v128.bitselect (local.get 0) (local.get 1) (v128.const i8x16 0 -1 -1 0 0 0 0 0 -1 -1 -1 -1 -1 -1 0 0))))
+   `);
+-      assertEq(wasmSimdAnalysis(), "shuffle -> blend 8x16");  
++      assertEq(wasmSimdAnalysis(), "shuffle -> blend 8x16");
+ }
+ 
+ // Library
+diff --git a/js/src/jit/Assembler.h b/js/src/jit/Assembler.h
+index 97c2e337625b..cb7244776605 100644
+--- a/js/src/jit/Assembler.h
++++ b/js/src/jit/Assembler.h
+@@ -19,6 +19,8 @@
+ #  include "jit/loong64/Assembler-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ #  include "jit/riscv64/Assembler-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++#  include "jit/ppc64/Assembler-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ #  include "jit/wasm32/Assembler-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/BaselineIC.cpp b/js/src/jit/BaselineIC.cpp
+index c356538a024e..5ab631838f0e 100644
+--- a/js/src/jit/BaselineIC.cpp
++++ b/js/src/jit/BaselineIC.cpp
+@@ -120,6 +120,8 @@ AllocatableGeneralRegisterSet BaselineICAvailableGeneralRegs(size_t numInputs) {
+   MOZ_ASSERT(!regs.has(PseudoStackPointer));
+   MOZ_ASSERT(!regs.has(RealStackPointer));
+   MOZ_ASSERT(!regs.has(ICTailCallReg));
++#elif defined(JS_CODEGEN_PPC64)
++  regs.take(ICTailCallReg);
+ #endif
+   regs.take(ICStubReg);
+ 
+diff --git a/js/src/jit/CacheIRCompiler.cpp b/js/src/jit/CacheIRCompiler.cpp
+index 4eb952e497e3..ee4888495103 100644
+--- a/js/src/jit/CacheIRCompiler.cpp
++++ b/js/src/jit/CacheIRCompiler.cpp
+@@ -10302,6 +10302,14 @@ bool CacheIRCompiler::emitConcatStringsResult(StringOperandId lhsId,
+     liveRegs.add(ICTailCallReg);
+ #endif
+     liveRegs.takeUnchecked(output.valueReg());
++
++#ifdef JS_CODEGEN_PPC64
++    // On PPC64, LR is an SPR, not a GPR, so ICTailCallReg is a regular
++    // GPR that does not shadow LR. The inner bctrl will clobber LR, so
++    // save/restore it explicitly.
++    masm.xs_mflr(r0);
++    masm.push(r0);
++#endif
+     masm.PushRegsInMask(liveRegs);
+ 
+     // The stub expects lhs in CallTempReg0 and rhs in CallTempReg1.
+@@ -10322,11 +10330,19 @@ bool CacheIRCompiler::emitConcatStringsResult(StringOperandId lhsId,
+     masm.branchTestPtr(Assembler::Zero, CallTempReg5, CallTempReg5, &vmCall);
+     masm.tagValue(JSVAL_TYPE_STRING, CallTempReg5, output.valueReg());
+     masm.PopRegsInMask(liveRegs);
++#ifdef JS_CODEGEN_PPC64
++    masm.pop(r0);
++    masm.xs_mtlr(r0);
++#endif
+     masm.jump(&done);
+ 
+     masm.bind(&vmCall);
+     masm.setFramePushed(framePushed);
+     masm.PopRegsInMask(liveRegs);
++#ifdef JS_CODEGEN_PPC64
++    masm.pop(r0);
++    masm.xs_mtlr(r0);
++#endif
+   }
+ 
+   {
+diff --git a/js/src/jit/CodeGenerator.cpp b/js/src/jit/CodeGenerator.cpp
+index a1c01409e9f7..2a2c6007aec0 100644
+--- a/js/src/jit/CodeGenerator.cpp
++++ b/js/src/jit/CodeGenerator.cpp
+@@ -2519,6 +2519,12 @@ static bool PrepareAndExecuteRegExp(MacroAssembler& masm, Register regexp,
+   masm.computeEffectiveAddress(Address(FramePointer, ioOffset), temp2);
+   masm.PushRegsInMask(volatileRegs);
+   masm.setupUnalignedABICall(temp3);
++#if defined(JS_CODEGEN_PPC64)
++  // temp1 aliases argregs on this platform, so we need to reuse temp3
++  // or we'll stomp on the code pointer when we pass the first ABI argument.
++  masm.movePtr(codePointer, temp3);
++  codePointer = temp3;
++#endif
+   masm.passABIArg(temp2);
+   masm.callWithABI(codePointer);
+   masm.storeCallInt32Result(temp1);
+diff --git a/js/src/jit/CodeGenerator.h b/js/src/jit/CodeGenerator.h
+index 58c047dea41b..3781b9595dfd 100644
+--- a/js/src/jit/CodeGenerator.h
++++ b/js/src/jit/CodeGenerator.h
+@@ -23,6 +23,8 @@
+ #  include "jit/loong64/CodeGenerator-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ #  include "jit/riscv64/CodeGenerator-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++#  include "jit/ppc64/CodeGenerator-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ #  include "jit/wasm32/CodeGenerator-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/EffectiveAddressAnalysis.cpp b/js/src/jit/EffectiveAddressAnalysis.cpp
+index e1bd1bd045ef..88697c06907c 100644
+--- a/js/src/jit/EffectiveAddressAnalysis.cpp
++++ b/js/src/jit/EffectiveAddressAnalysis.cpp
+@@ -60,7 +60,7 @@ static bool OffsetIsSmallEnough(int32_t imm) {
+   // `movn #imm`.  arm32 is similar.
+   return imm >= -0xFFFF && imm <= 0xFFFF;
+ #elif defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_LOONG64) || \
+-    defined(JS_CODEGEN_MIPS64)
++    defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_PPC64)
+   return imm >= -0xFFF && imm <= 0xFFF;
+ #elif defined(JS_CODEGEN_WASM32) || defined(JS_CODEGEN_NONE)
+   return true;
+diff --git a/js/src/jit/ExecutableAllocator.cpp b/js/src/jit/ExecutableAllocator.cpp
+index 340a63964b52..c9336fe8ec4e 100644
+--- a/js/src/jit/ExecutableAllocator.cpp
++++ b/js/src/jit/ExecutableAllocator.cpp
+@@ -306,13 +306,19 @@ void ExecutableAllocator::poisonCode(JSRuntime* rt,
+     }
+   }
+ 
+-  // Make the pools executable again and drop references. We don't flush the
+-  // ICache here to not add extra overhead.
++  // Make the pools executable again and drop references. On architectures with
++  // incoherent ICache (PPC64), we must flush to prevent stale instruction
++  // execution when code regions are reused after sweeping.
+   for (size_t i = 0; i < ranges.length(); i++) {
+     ExecutablePool* pool = ranges[i].pool;
+     if (pool->isMarked()) {
++#ifdef JS_CODEGEN_PPC64
++      reprotectPool(rt, pool, ProtectionSetting::Executable,
++                    MustFlushICache::Yes);
++#else
+       reprotectPool(rt, pool, ProtectionSetting::Executable,
+                     MustFlushICache::No);
++#endif
+       pool->unmark();
+     }
+     pool->release();
+diff --git a/js/src/jit/FlushICache.cpp b/js/src/jit/FlushICache.cpp
+index d3b1657a6be2..9590687c9803 100644
+--- a/js/src/jit/FlushICache.cpp
++++ b/js/src/jit/FlushICache.cpp
+@@ -13,7 +13,8 @@
+ #  include "jit/arm64/vixl/Simulator-vixl.h"
+ #endif
+ 
+-#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)
++#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) || \
++    defined(JS_CODEGEN_PPC64)
+ 
+ #  ifdef __linux__
+ #    include <linux/version.h>
+diff --git a/js/src/jit/FlushICache.h b/js/src/jit/FlushICache.h
+index af79da356ee5..58396f62ae0d 100644
+--- a/js/src/jit/FlushICache.h
++++ b/js/src/jit/FlushICache.h
+@@ -21,7 +21,7 @@ inline void FlushICache(void* code, size_t size) {
+ }
+ #elif (defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)) ||  \
+     defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+-    defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ 
+ // Invalidate the given code range from the icache. This will also flush the
+ // execution context for this core. If this code is to be executed on another
+@@ -37,7 +37,7 @@ inline void FlushICache(void* code, size_t size) { MOZ_CRASH(); }
+ #  error "Unknown architecture!"
+ #endif
+ 
+-#if (defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64)) ||      \
++#if (defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64)) ||    \
+     defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+     defined(JS_CODEGEN_RISCV64)
+ 
+@@ -55,10 +55,11 @@ inline void FlushExecutionContext() { MOZ_CRASH(); }
+ inline bool CanFlushExecutionContextForAllThreads() { MOZ_CRASH(); }
+ inline void FlushExecutionContextForAllThreads() { MOZ_CRASH(); }
+ 
+-#elif defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)
++#elif defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) || \
++    defined(JS_CODEGEN_PPC64)
+ 
+-// ARM and ARM64 must flush the instruction pipeline of the current core
+-// before executing newly JIT'ed code. This will remove any stale data from
++// ARM, ARM64, and PPC64 must flush the instruction pipeline of the current
++// core before executing newly JIT'ed code. This will remove any stale data from
+ // the pipeline that may have referenced invalidated instructions.
+ //
+ // `FlushICache` will perform this for the thread that compiles the code, but
+diff --git a/js/src/jit/GenerateABIFunctionType.py b/js/src/jit/GenerateABIFunctionType.py
+index 04be10d1de2a..815427ec6771 100644
+--- a/js/src/jit/GenerateABIFunctionType.py
++++ b/js/src/jit/GenerateABIFunctionType.py
+@@ -538,6 +538,102 @@ def riscv64_simulator_dispatch(func_types):
+     return contents
+ 
+ 
++# PPC64 ELFv2 ABI: 8 int arg regs (r3-r10), 13 FP arg regs (f1-f13).
++# Each floating-point argument consumes BOTH a float-arg slot AND a
++# general-purpose-register shadow slot (capped at 8 GPR slots), matching
++# what GCC and the JIT's ABIArgGenerator do for ELFv2 PPC64LE. Without
++# the shadow, integer args following a float go to the wrong register
++# at the call boundary, producing a use-after-free / wrong-pointer crash
++# in the C callee. (Verified empirically by disassembling
++# NumberBigIntCompare(double, BigInt*) on real PPC64: BigInt* is read
++# from r4, not r3.)
++def ppc64_args(func_type):
++    contents = ""
++    numIntArgRegs = 8
++    numFloatArgRegs = 13
++    intRegIndex = 0
++    floatRegIndex = 0
++    stackOffset = 0
++    for i, arg in enumerate(func_type["args"]):
++        if i != 0:
++            contents += ", "
++
++        if arg == "General":
++            if intRegIndex == numIntArgRegs:
++                contents += f"sp_[{stackOffset}]"
++                stackOffset += 1
++            else:
++                contents += f"a{intRegIndex}_"
++                intRegIndex += 1
++        elif arg == "Int32":
++            if intRegIndex == numIntArgRegs:
++                contents += f"I32(sp_[{stackOffset}])"
++                stackOffset += 1
++            else:
++                contents += f"I32(a{intRegIndex}_)"
++                intRegIndex += 1
++        elif arg == "Int64":
++            if intRegIndex == numIntArgRegs:
++                contents += f"sp_[{stackOffset}]"
++                stackOffset += 1
++            else:
++                contents += f"a{intRegIndex}_"
++                intRegIndex += 1
++        elif arg == "Float32":
++            if floatRegIndex == numFloatArgRegs:
++                contents += f"*mozilla::BitwiseCast<float*>(sp_[{stackOffset}])"
++                stackOffset += 1
++            else:
++                contents += f"f{floatRegIndex}_s"
++                floatRegIndex += 1
++            # ELFv2: FP arg also consumes a GPR shadow slot.
++            if intRegIndex < numIntArgRegs:
++                intRegIndex += 1
++        elif arg == "Float64":
++            if floatRegIndex == numFloatArgRegs:
++                contents += f"mozilla::BitwiseCast<double>(sp_[{stackOffset}])"
++                stackOffset += 1
++            else:
++                contents += f"f{floatRegIndex}_d"
++                floatRegIndex += 1
++            # ELFv2: FP arg also consumes a GPR shadow slot.
++            if intRegIndex < numIntArgRegs:
++                intRegIndex += 1
++    assert intRegIndex <= numIntArgRegs
++    assert floatRegIndex <= numFloatArgRegs
++    return contents
++
++
++def ppc64_simulator_dispatch(func_types):
++    contents = ""
++    for func_type in func_types:
++        args = ppc64_args(func_type)
++        contents += f"case js::jit::Args_{func_type_name(func_type)}: {{\\\n"
++        contents += f"  auto target = reinterpret_cast<Prototype_{func_type_name(func_type)}>(nativeFn);\\\n"
++        ret = func_type["ret"]
++        if ret == "Void":
++            contents += f"  target({args});\\\n"
++        else:
++            contents += f"  auto ret = target({args});\\\n"
++        if ret == "Void":
++            pass
++        elif ret == "General":
++            contents += "  setCallResult(ret);\\\n"
++        elif ret == "Int32":
++            contents += "  setCallResult(I64(ret));\\\n"
++        elif ret == "Int64":
++            contents += "  setCallResult(ret);\\\n"
++        elif ret == "Float32":
++            contents += "  setCallResultFloat(ret);\\\n"
++        elif ret == "Float64":
++            contents += "  setCallResultDouble(ret);\\\n"
++        else:
++            raise ValueError(f"Unknown ret type: {ret}")
++        contents += "  break;\\\n"
++        contents += "}\\\n"
++    return contents
++
++
+ def main(c_out, yaml_path):
+     func_types = load_yaml(yaml_path)
+ 
+@@ -581,4 +677,8 @@ def main(c_out, yaml_path):
+     contents += riscv64_simulator_dispatch(func_types)
+     contents += "\n"
+ 
++    contents += "#define ABI_FUNCTION_TYPE_PPC64_SIM_DISPATCH \\\n"
++    contents += ppc64_simulator_dispatch(func_types)
++    contents += "\n"
++
+     generate_header(c_out, "jit_ABIFunctionTypeGenerated_h", contents)
+diff --git a/js/src/jit/JitContext.cpp b/js/src/jit/JitContext.cpp
+index 79b22d9f249f..d399ddd36fd4 100644
+--- a/js/src/jit/JitContext.cpp
++++ b/js/src/jit/JitContext.cpp
+@@ -121,6 +121,10 @@ bool jit::InitializeJit() {
+   RVFlags::Init();
+ #endif
+ 
++#ifdef JS_CODEGEN_PPC64
++  PPC64Flags::Init();
++#endif
++
+ #ifndef JS_CODEGEN_NONE
+   MOZ_ASSERT(js::jit::CPUFlagsHaveBeenComputed());
+ #endif
+diff --git a/js/src/jit/JitFrames.cpp b/js/src/jit/JitFrames.cpp
+index 3653af3a21f4..bbd1376dec69 100644
+--- a/js/src/jit/JitFrames.cpp
++++ b/js/src/jit/JitFrames.cpp
+@@ -1824,7 +1824,12 @@ Value SnapshotIterator::allocationValue(const RValueAllocation& alloc,
+       return DoubleValue(fromRegister<double>(alloc.fpuReg()));
+ 
+     case RValueAllocation::FLOAT32_REG:
++#if defined(JS_CODEGEN_PPC64) || defined(JS_CODEGEN_RISCV64)
++      return Float32Value(
++          float(fromRegister<double>(alloc.fpuReg().asDouble())));
++#else
+       return Float32Value(fromRegister<float>(alloc.fpuReg()));
++#endif
+ 
+     case RValueAllocation::FLOAT32_STACK:
+       return Float32Value(ReadFrameFloat32Slot(fp_, alloc.stackOffset()));
+@@ -2625,7 +2630,12 @@ uintptr_t MachineState::read(Register reg) const {
+ 
+ template <typename T>
+ T MachineState::read(FloatRegister reg) const {
++#if defined(JS_CODEGEN_PPC64) || defined(JS_CODEGEN_RISCV64)
++  // PPC64/RISCV64 always store FloatRegisters as 64-bit doubles.
++  MOZ_ASSERT(reg.size() >= sizeof(T));
++#else
+   MOZ_ASSERT(reg.size() == sizeof(T));
++#endif
+ 
+ #if !defined(JS_CODEGEN_NONE) && !defined(JS_CODEGEN_WASM32)
+   if (state_.is<BailoutState>()) {
+diff --git a/js/src/jit/JitFrames.h b/js/src/jit/JitFrames.h
+index ac7005a5fcfc..490834e62fec 100644
+--- a/js/src/jit/JitFrames.h
++++ b/js/src/jit/JitFrames.h
+@@ -322,6 +322,16 @@ enum class ExceptionResumeKind : int32_t {
+ 
+ // Data needed to recover from an exception.
+ struct ResumeFromException {
++#if defined(JS_CODEGEN_PPC64)
++  // This struct is built on the stack as part of exception returns. Because
++  // it goes right on top of the stack, an ABI-compliant routine can wreck
++  // it, so we implement a minimum Power ISA linkage area (four doublewords).
++  void* _ppc_sp_;
++  void* _ppc_cr_;
++  void* _ppc_lr_;
++  void* _ppc_toc_;
++#endif
++
+   uint8_t* framePointer;
+   uint8_t* stackPointer;
+   uint8_t* target;
+@@ -373,7 +383,7 @@ struct ResumeFromException {
+   }
+ };
+ 
+-#if defined(JS_CODEGEN_ARM64)
++#if defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
+ static_assert(sizeof(ResumeFromException) % 16 == 0,
+               "ResumeFromException should be aligned");
+ #endif
+diff --git a/js/src/jit/LIR.cpp b/js/src/jit/LIR.cpp
+index 2f89fb407349..a9f634b7fcc1 100644
+--- a/js/src/jit/LIR.cpp
++++ b/js/src/jit/LIR.cpp
+@@ -779,8 +779,8 @@ bool LMoveGroup::add(LAllocation from, LAllocation to, LDefinition::Type type) {
+     // CodeGeneratorShared::CodeGeneratorShared and in general everywhere
+     // SimdMemoryAignment is used.  Likely, alignment requirements will return.
+ #   if defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64) || \
+-       defined(JS_CODEGEN_ARM64)
+-      // No need for any check on x86/x64/arm64.
++       defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
++      // No need for any check on x86/x64/arm64/ppc64.
+ #   else
+ #     error "Need to consider SIMD alignment on this target."
+       // The following code may be of use if we need alignment checks on
+diff --git a/js/src/jit/LIR.h b/js/src/jit/LIR.h
+index 3f4efeda7955..3354cb96b0cb 100644
+--- a/js/src/jit/LIR.h
++++ b/js/src/jit/LIR.h
+@@ -200,7 +200,7 @@ class LUse : public LAllocation {
+   static const uint32_t POLICY_BITS = 3;
+   static const uint32_t POLICY_SHIFT = 0;
+   static const uint32_t POLICY_MASK = (1 << POLICY_BITS) - 1;
+-#ifdef JS_CODEGEN_ARM64
++#if defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
+   static const uint32_t REG_BITS = 7;
+ #else
+   static const uint32_t REG_BITS = 6;
+@@ -619,12 +619,18 @@ class LDefinition {
+   Type type() const { return (Type)((bits_ >> TYPE_SHIFT) & TYPE_MASK); }
+ 
+   static bool isFloatRegCompatible(Type type, FloatRegister reg) {
++#if defined(JS_CODEGEN_PPC64) || defined(JS_CODEGEN_RISCV64)
++    if (type == FLOAT32 || type == DOUBLE) {
++      return reg.isSingle() || reg.isDouble();
++    }
++#else
+     if (type == FLOAT32) {
+       return reg.isSingle();
+     }
+     if (type == DOUBLE) {
+       return reg.isDouble();
+     }
++#endif
+     MOZ_ASSERT(type == SIMD128);
+     return reg.isSimd128();
+   }
+@@ -2292,6 +2298,8 @@ AnyRegister LAllocation::toAnyRegister() const {
+ #  include "jit/loong64/LIR-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ #  include "jit/riscv64/LIR-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++#  include "jit/ppc64/LIR-ppc64.h"
+ #elif defined(JS_CODEGEN_MIPS64)
+ #  include "jit/mips-shared/LIR-mips-shared.h"
+ #  include "jit/mips64/LIR-mips64.h"
+diff --git a/js/src/jit/LIROps.yaml b/js/src/jit/LIROps.yaml
+index 315ff5fd5348..7fbea9e2ebec 100644
+--- a/js/src/jit/LIROps.yaml
++++ b/js/src/jit/LIROps.yaml
+@@ -2210,7 +2210,7 @@
+     oldval: WordSized
+     newval: WordSized
+   # Needs additional temps on LL/SC platforms to extract/insert bits of word.
+-#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   num_temps: 4
+ #else
+   num_temps: 1
+@@ -2224,7 +2224,7 @@
+     index: WordSized
+     value: WordSized
+   # Needs additional temps on LL/SC platforms to extract/insert bits of word.
+-#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   num_temps: 4
+ #else
+   num_temps: 1
+@@ -2238,7 +2238,7 @@
+     index: WordSized
+     value: WordSized
+   # Needs additional temps on LL/SC platforms to extract/insert bits of word.
+-#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   num_temps: 4
+ #else
+   num_temps: 2
+@@ -2255,7 +2255,7 @@
+   # Needs additional temps on LL/SC platforms to extract/insert bits of word.
+ #if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)
+   num_temps: 1
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   num_temps: 3
+ #endif
+   mir_op: AtomicTypedArrayElementBinop
+@@ -3066,7 +3066,7 @@
+   operands:
+     ptr: WordSized
+     memoryBase: WordSized
+-#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   num_temps: 1
+ #endif
+   mir_op: true
+@@ -3078,7 +3078,7 @@
+     memoryBase: WordSized
+ #ifdef JS_CODEGEN_ARM
+   num_temps: 2
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   num_temps: 1
+ #endif
+   mir_op: WasmLoad
+@@ -3088,7 +3088,7 @@
+     ptr: WordSized
+     value: WordSized
+     memoryBase: WordSized
+-#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   num_temps: 1
+ #endif
+   mir_op: true
+@@ -3098,7 +3098,7 @@
+     ptr: WordSized
+     value: Int64
+     memoryBase: WordSized
+-#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   num_temps: 1
+ #endif
+   mir_op: WasmStore
+@@ -3128,7 +3128,7 @@
+     memoryBase: WordSized
+ #ifdef JS_CODEGEN_X86
+   num_temps: 1
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   # Temp that may be used on LL/SC platforms for extract/insert bits of word.
+   num_temps: 3
+ #endif
+@@ -3142,7 +3142,7 @@
+     memoryBase: WordSized
+ #ifdef JS_CODEGEN_X86
+   num_temps: 1
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   # Temp that may be used on LL/SC platforms for extract/insert bits of word.
+   num_temps: 3
+ #endif
+@@ -3154,7 +3154,7 @@
+     ptr: WordSized
+     value: WordSized
+     memoryBase: WordSized
+-#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   # Temp that may be used on LL/SC platforms for extract/insert bits of word.
+   num_temps: 3
+ #elifdef JS_CODEGEN_X86
+@@ -3171,7 +3171,7 @@
+     ptr: WordSized
+     value: WordSized
+     memoryBase: WordSized
+-#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   # Temp that may be used on LL/SC platforms for extract/insert bits of word.
+   num_temps: 3
+ #elif defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)
+@@ -4424,6 +4424,64 @@
+   mir_op: WasmAtomicExchangeHeap
+ #endif
+ 
++#ifdef JS_CODEGEN_PPC64
++- name: DivOrModI64
++  gen_boilerplate: false
++
++- name: UDivOrMod
++  gen_boilerplate: false
++
++- name: UDivOrModI64
++  gen_boilerplate: false
++
++- name: ModMaskI
++  result_type: WordSized
++  operands:
++    input: WordSized
++  arguments:
++    shift: int32_t
++  num_temps: 2
++  mir_op: Mod
++
++- name: WasmTruncateToInt64
++  result_type: Int64
++  operands:
++    input: WordSized
++  mir_op: true
++
++- name: Int64ToFloatingPoint
++  result_type: WordSized
++  operands:
++    input: Int64
++  mir_op: true
++
++- name: WasmCompareExchangeI64
++  result_type: Int64
++  operands:
++    ptr: WordSized
++    oldValue: Int64
++    newValue: Int64
++    memoryBase: WordSized
++  mir_op: WasmCompareExchangeHeap
++
++- name: WasmAtomicBinopI64
++  result_type: Int64
++  operands:
++    ptr: WordSized
++    value: Int64
++    memoryBase: WordSized
++  num_temps64: 1
++  mir_op: WasmAtomicBinopHeap
++
++- name: WasmAtomicExchangeI64
++  result_type: Int64
++  operands:
++    ptr: WordSized
++    value: Int64
++    memoryBase: WordSized
++  mir_op: WasmAtomicExchangeHeap
++#endif
++
+ #ifdef JS_CODEGEN_RISCV64
+ - name: UDiv
+   result_type: WordSized
+diff --git a/js/src/jit/Label.h b/js/src/jit/Label.h
+index 061bf978d26f..2a49ded9c967 100644
+--- a/js/src/jit/Label.h
++++ b/js/src/jit/Label.h
+@@ -23,7 +23,7 @@ struct LabelBase {
+   uint32_t offset_ : 31;
+ 
+ #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+-    defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+  public:
+ #endif
+   static const uint32_t INVALID_OFFSET = 0x7fffffff;  // UINT31_MAX.
+diff --git a/js/src/jit/Lowering.cpp b/js/src/jit/Lowering.cpp
+index 9c1c4b0df491..e3fe71ea9c83 100644
+--- a/js/src/jit/Lowering.cpp
++++ b/js/src/jit/Lowering.cpp
+@@ -1174,7 +1174,7 @@ void LIRGenerator::visitTest(MTest* test) {
+ 
+ #if defined(ENABLE_WASM_SIMD) &&                           \
+     (defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64) || \
+-     defined(JS_CODEGEN_ARM64))
++     defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64))
+   // Check if the operand for this test is an any_true/all_true SIMD operation.
+   // If it is, we want to emit an LWasmReduceAndBranchSimd128 node to avoid
+   // generating an intermediate boolean result.
+diff --git a/js/src/jit/Lowering.h b/js/src/jit/Lowering.h
+index b4f133758eb6..d973a68989b5 100644
+--- a/js/src/jit/Lowering.h
++++ b/js/src/jit/Lowering.h
+@@ -23,6 +23,8 @@
+ #  include "jit/loong64/Lowering-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ #  include "jit/riscv64/Lowering-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++#  include "jit/ppc64/Lowering-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ #  include "jit/wasm32/Lowering-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/MacroAssembler-inl.h b/js/src/jit/MacroAssembler-inl.h
+index 4747a22e171b..d7385df895d5 100644
+--- a/js/src/jit/MacroAssembler-inl.h
++++ b/js/src/jit/MacroAssembler-inl.h
+@@ -39,6 +39,8 @@
+ #  include "jit/loong64/MacroAssembler-loong64-inl.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ #  include "jit/riscv64/MacroAssembler-riscv64-inl.h"
++#elif defined(JS_CODEGEN_PPC64)
++#  include "jit/ppc64/MacroAssembler-ppc64-inl.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ #  include "jit/wasm32/MacroAssembler-wasm32-inl.h"
+ #elif !defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/MacroAssembler.cpp b/js/src/jit/MacroAssembler.cpp
+index eb95d6c9e2c4..5b28e811c88d 100644
+--- a/js/src/jit/MacroAssembler.cpp
++++ b/js/src/jit/MacroAssembler.cpp
+@@ -6128,7 +6128,7 @@ static void MoveDataBlock(MacroAssembler& masm, Register base, int32_t from,
+   static constexpr Register scratch = ABINonArgReg0;
+   masm.push(scratch);
+ #elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+-    defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   UseScratchRegisterScope temps(masm);
+   Register scratch = temps.Acquire();
+ #elif !defined(JS_CODEGEN_NONE)
+@@ -6315,6 +6315,12 @@ static void CollapseWasmFrameFast(MacroAssembler& masm,
+ 
+ #ifdef JS_USE_LINK_REGISTER
+   // RA is already in its place, just move stack.
++#  ifdef JS_CODEGEN_PPC64
++  // PPC64's LR is not a GPR, so WasmTailCallRAScratchReg is a normal GPR
++  // (r14). We must explicitly move it to LR so the callee's prologue
++  // (pushReturnAddress) saves the correct return address.
++  masm.xs_mtlr(tempForRA);
++#  endif
+   masm.addToStackPtr(Imm32(framePushedAtStart + newArgDest));
+ #else
+   // Push RA to new frame: store RA, restore temp, and move stack.
+@@ -6463,6 +6469,12 @@ static void CollapseWasmFrameSlow(MacroAssembler& masm,
+ #ifdef JS_USE_LINK_REGISTER
+   masm.freeStack(reserved);
+   // RA is already in its place, just move stack.
++#  ifdef JS_CODEGEN_PPC64
++  // PPC64's LR is not a GPR, so WasmTailCallRAScratchReg is a normal GPR
++  // (r14). We must explicitly move the trampoline address to LR so the
++  // callee returns to the trampoline.
++  masm.xs_mtlr(tempForRA);
++#  endif
+   masm.addToStackPtr(Imm32(framePushedAtStart + newArgDest));
+ #else
+   // Push RA to new frame: store RA, restore temp, and move stack.
+@@ -8527,7 +8539,7 @@ void MacroAssembler::debugAssertCanonicalInt32(Register r) {
+     breakpoint();
+     bind(&ok);
+ #    elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+-        defined(JS_CODEGEN_RISCV64)
++        defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+     Label ok;
+     UseScratchRegisterScope temps(*this);
+     Register scratch = temps.Acquire();
+@@ -10567,6 +10579,15 @@ void MacroAssembler::orderedHashTableLookup(Register setOrMapObj,
+   unboxInt32(Address(setOrMapObj, TableObject::offsetOfLiveCount()), temp1);
+   branchTest32(Assembler::Zero, temp1, temp1, &notFound);
+ 
++#if defined(JS_CODEGEN_PPC64)
++  // If this was preceded by a MoveGroup instruction, the hash may have been
++  // loaded algebraically since it's an Int32 (and thus sign-extended); the
++  // operation doesn't know to keep the upper bits clear, failing the assert.
++  if (isBigInt == IsBigInt::No) {
++    as_rldicl(hash, hash, 0, 32);
++  }
++#endif
++
+ #ifdef DEBUG
+   PushRegsInMask(LiveRegisterSet(RegisterSet::Volatile()));
+ 
+diff --git a/js/src/jit/MacroAssembler.h b/js/src/jit/MacroAssembler.h
+index 6c08bb554ca8..754e8642bb57 100644
+--- a/js/src/jit/MacroAssembler.h
++++ b/js/src/jit/MacroAssembler.h
+@@ -23,6 +23,8 @@
+ #  include "jit/loong64/MacroAssembler-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ #  include "jit/riscv64/MacroAssembler-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++#  include "jit/ppc64/MacroAssembler-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ #  include "jit/wasm32/MacroAssembler-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+@@ -93,8 +95,9 @@
+ //   }
+ //   ////}}} check_macroassembler_style
+ 
+-#define ALL_ARCH mips64, arm, arm64, x86, x64, loong64, riscv64, wasm32
+-#define ALL_SHARED_ARCH arm, arm64, loong64, mips64, riscv64, x86_shared, wasm32
++#define ALL_ARCH mips64, arm, arm64, x86, x64, loong64, riscv64, ppc64, wasm32
++#define ALL_SHARED_ARCH \
++  arm, arm64, loong64, mips64, riscv64, ppc64, x86_shared, wasm32
+ 
+ // * How this macro works:
+ //
+@@ -140,6 +143,7 @@
+ #define DEFINED_ON_mips64
+ #define DEFINED_ON_loong64
+ #define DEFINED_ON_riscv64
++#define DEFINED_ON_ppc64
+ #define DEFINED_ON_wasm32
+ #define DEFINED_ON_none
+ 
+@@ -169,6 +173,9 @@
+ #elif defined(JS_CODEGEN_RISCV64)
+ #  undef DEFINED_ON_riscv64
+ #  define DEFINED_ON_riscv64 define
++#elif defined(JS_CODEGEN_PPC64)
++#  undef DEFINED_ON_ppc64
++#  define DEFINED_ON_ppc64 define
+ #elif defined(JS_CODEGEN_WASM32)
+ #  undef DEFINED_ON_wasm32
+ #  define DEFINED_ON_wasm32 define
+@@ -562,7 +569,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   void Pop(const Register64 reg);
+   void PopFlags() DEFINED_ON(x86_shared);
+   void PopStackPtr()
+-      DEFINED_ON(arm, mips64, x86_shared, loong64, riscv64, wasm32);
++      DEFINED_ON(arm, mips64, x86_shared, loong64, riscv64, ppc64, wasm32);
+ 
+   // Move the stack pointer based on the requested amount.
+   void adjustStack(int amount);
+@@ -620,9 +627,9 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   // These do not adjust framePushed().
+   void pushReturnAddress()
+-      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32);
++      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, ppc64, wasm32);
+   void popReturnAddress()
+-      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32);
++      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, ppc64, wasm32);
+ 
+   // Useful for dealing with two-valued returns.
+   void moveRegPair(Register src0, Register src1, Register dst0, Register dst1,
+@@ -641,7 +648,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   CodeOffset farJumpWithPatch() PER_SHARED_ARCH;
+   void patchFarJump(CodeOffset farJump, uint32_t targetOffset) PER_SHARED_ARCH;
+   static void patchFarJump(uint8_t* farJump, uint8_t* target)
+-      DEFINED_ON(arm, arm64, x86_shared, loong64, mips64, riscv64);
++      DEFINED_ON(arm, arm64, x86_shared, loong64, mips64, riscv64, ppc64);
+ 
+   // Emit a nop that can be patched to and from a nop and a call with int32
+   // relative displacement.
+@@ -667,9 +674,9 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // target behaviour is only provided for `n` in the range 0 .. 2^31-1
+   // inclusive.
+   CodeOffset move32WithPatch(Register dest)
+-      DEFINED_ON(x86_shared, arm, arm64, loong64, mips64, riscv64);
++      DEFINED_ON(x86_shared, arm, arm64, loong64, mips64, riscv64, ppc64);
+   void patchMove32(CodeOffset offset, Imm32 n)
+-      DEFINED_ON(x86_shared, arm, arm64, loong64, mips64, riscv64);
++      DEFINED_ON(x86_shared, arm, arm64, loong64, mips64, riscv64, ppc64);
+ 
+  public:
+   // ===============================================================
+@@ -1174,13 +1181,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   inline void mulPtr(ImmWord rhs, Register srcDest) PER_ARCH;
+ 
+   inline void mul64(const Register64& rhs, const Register64& srcDest)
+-      DEFINED_ON(x64, arm64, mips64, loong64, riscv64);
++      DEFINED_ON(x64, arm64, mips64, loong64, riscv64, ppc64);
+   inline void mul64(const Operand& src, const Register64& dest) DEFINED_ON(x64);
+   inline void mul64(const Operand& src, const Register64& dest,
+                     const Register temp) DEFINED_ON(x64);
+   inline void mul64(Imm64 imm, const Register64& dest) PER_ARCH;
+   inline void mul64(Imm64 imm, const Register64& dest, const Register temp)
+-      DEFINED_ON(x86, x64, arm, mips64, loong64, riscv64);
++      DEFINED_ON(x86, x64, arm, mips64, loong64, riscv64, ppc64);
+   inline void mul64(const Register64& src, const Register64& dest,
+                     const Register temp) PER_ARCH;
+   inline void mul64(const Register64& src1, const Register64& src2,
+@@ -1202,11 +1209,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // On ARM, the chip must have hardware division instructions.
+   inline void quotient32(Register lhs, Register rhs, Register dest,
+                          bool isUnsigned)
+-      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32);
++      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32, ppc64);
+ 
+   inline void quotient64(Register lhs, Register rhs, Register dest,
+                          bool isUnsigned)
+-      DEFINED_ON(arm64, loong64, mips64, riscv64);
++      DEFINED_ON(arm64, loong64, mips64, riscv64, ppc64);
+ 
+   // As above, but lhs and dest must be eax and tempEdx must be edx.
+   inline void quotient32(Register lhs, Register rhs, Register dest,
+@@ -1219,11 +1226,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // On ARM, the chip must have hardware division instructions.
+   inline void remainder32(Register lhs, Register rhs, Register dest,
+                           bool isUnsigned)
+-      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32);
++      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32, ppc64);
+ 
+   inline void remainder64(Register lhs, Register rhs, Register dest,
+                           bool isUnsigned)
+-      DEFINED_ON(arm64, loong64, mips64, riscv64);
++      DEFINED_ON(arm64, loong64, mips64, riscv64, ppc64);
+ 
+   // As above, but lhs and dest must be eax and tempEdx must be edx.
+   inline void remainder32(Register lhs, Register rhs, Register dest,
+@@ -2080,7 +2087,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   template <typename T>
+   void branchValueIsNurseryCellImpl(Condition cond, const T& value,
+                                     Register temp, Label* label)
+-      DEFINED_ON(arm64, x64, mips64, loong64, riscv64);
++      DEFINED_ON(arm64, x64, mips64, loong64, riscv64, ppc64);
+ 
+   template <typename T>
+   inline void branchTestUndefinedImpl(Condition cond, const T& t, Label* label)
+@@ -2245,7 +2252,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // from all the other registers, on all supported targets.
+   inline void wasmAddSubI128HI64(Register lhsLo, Register lhsHi, Register rhsLo,
+                                  Register rhsHi, Register output, bool isAdd)
+-      DEFINED_ON(x64, arm64, riscv64, loong64, mips64);
++      DEFINED_ON(x64, arm64, riscv64, loong64, mips64, ppc64);
+ 
+   // Produces the top 64 bits of the 128-bit value `RAX *widen rhs`.  The result
+   // will be in RAX.  RDX is trashed.  `rhs` may not be RAX or RDX.  Callers
+@@ -2256,7 +2263,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // what the registers may be.
+   inline void wasmMulI64WideHI64(Register lhs, Register rhs, Register output,
+                                  bool isSigned)
+-      DEFINED_ON(arm64, riscv64, loong64, mips64);
++      DEFINED_ON(arm64, riscv64, loong64, mips64, ppc64);
+ 
+   // ========================================================================
+   // Canonicalization primitives.
+@@ -2355,68 +2362,68 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // Moves
+ 
+   inline void moveSimd128(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Constants
+ 
+   inline void loadConstantSimd128(const SimdConstant& v, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Splat
+ 
+   inline void splatX16(Register src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void splatX16(uint32_t srcLane, FloatRegister src, FloatRegister dest)
+       DEFINED_ON(arm64);
+ 
+   inline void splatX8(Register src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void splatX8(uint32_t srcLane, FloatRegister src, FloatRegister dest)
+       DEFINED_ON(arm64);
+ 
+   inline void splatX4(Register src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void splatX4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void splatX2(Register64 src, FloatRegister dest)
+-      DEFINED_ON(x86, x64, arm64);
++      DEFINED_ON(x86, x64, arm64, ppc64);
+ 
+   inline void splatX2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Extract lane as scalar.  Float extraction does not canonicalize the value.
+ 
+   inline void extractLaneInt8x16(uint32_t lane, FloatRegister src,
+-                                 Register dest) DEFINED_ON(x86_shared, arm64);
++                                 Register dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedExtractLaneInt8x16(uint32_t lane, FloatRegister src,
+                                          Register dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void extractLaneInt16x8(uint32_t lane, FloatRegister src,
+-                                 Register dest) DEFINED_ON(x86_shared, arm64);
++                                 Register dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedExtractLaneInt16x8(uint32_t lane, FloatRegister src,
+                                          Register dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void extractLaneInt32x4(uint32_t lane, FloatRegister src,
+-                                 Register dest) DEFINED_ON(x86_shared, arm64);
++                                 Register dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void extractLaneInt64x2(uint32_t lane, FloatRegister src,
+-                                 Register64 dest) DEFINED_ON(x86, x64, arm64);
++                                 Register64 dest) DEFINED_ON(x86, x64, arm64, ppc64);
+ 
+   inline void extractLaneFloat32x4(uint32_t lane, FloatRegister src,
+                                    FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void extractLaneFloat64x2(uint32_t lane, FloatRegister src,
+                                    FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Replace lane value
+ 
+@@ -2425,21 +2432,21 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   inline void replaceLaneInt8x16(unsigned lane, Register rhs,
+                                  FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void replaceLaneInt16x8(unsigned lane, FloatRegister lhs, Register rhs,
+                                  FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void replaceLaneInt16x8(unsigned lane, Register rhs,
+                                  FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void replaceLaneInt32x4(unsigned lane, FloatRegister lhs, Register rhs,
+                                  FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void replaceLaneInt32x4(unsigned lane, Register rhs,
+                                  FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void replaceLaneInt64x2(unsigned lane, FloatRegister lhs,
+                                  Register64 rhs, FloatRegister dest)
+@@ -2447,7 +2454,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   inline void replaceLaneInt64x2(unsigned lane, Register64 rhs,
+                                  FloatRegister lhsDest)
+-      DEFINED_ON(x86, x64, arm64);
++      DEFINED_ON(x86, x64, arm64, ppc64);
+ 
+   inline void replaceLaneFloat32x4(unsigned lane, FloatRegister lhs,
+                                    FloatRegister rhs, FloatRegister dest)
+@@ -2455,7 +2462,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   inline void replaceLaneFloat32x4(unsigned lane, FloatRegister rhs,
+                                    FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void replaceLaneFloat64x2(unsigned lane, FloatRegister lhs,
+                                    FloatRegister rhs, FloatRegister dest)
+@@ -2463,7 +2470,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   inline void replaceLaneFloat64x2(unsigned lane, FloatRegister rhs,
+                                    FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Shuffle - blend and permute with immediate indices, and its many
+   // specializations.  Lane values other than those mentioned are illegal.
+@@ -2471,11 +2478,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // lane values 0..31
+   inline void shuffleInt8x16(const uint8_t lanes[16], FloatRegister rhs,
+                              FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void shuffleInt8x16(const uint8_t lanes[16], FloatRegister lhs,
+                              FloatRegister rhs, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Lane values must be 0 (select from lhs) or FF (select from rhs).
+   // The behavior is undefined for lane values that are neither 0 nor FF.
+@@ -2502,39 +2509,39 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // The implementation works effectively for I8x16, I16x8, I32x4, and I64x2.
+   inline void laneSelectSimd128(FloatRegister mask, FloatRegister lhs,
+                                 FloatRegister rhs, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void interleaveHighInt8x16(FloatRegister lhs, FloatRegister rhs,
+                                     FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void interleaveHighInt16x8(FloatRegister lhs, FloatRegister rhs,
+                                     FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void interleaveHighInt32x4(FloatRegister lhs, FloatRegister rhs,
+                                     FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void interleaveHighInt64x2(FloatRegister lhs, FloatRegister rhs,
+                                     FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void interleaveLowInt8x16(FloatRegister lhs, FloatRegister rhs,
+                                    FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void interleaveLowInt16x8(FloatRegister lhs, FloatRegister rhs,
+                                    FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void interleaveLowInt32x4(FloatRegister lhs, FloatRegister rhs,
+                                    FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void interleaveLowInt64x2(FloatRegister lhs, FloatRegister rhs,
+                                    FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Permute - permute with immediate indices.
+ 
+@@ -2544,7 +2551,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   // lane values 0..7
+   inline void permuteInt16x8(const uint16_t lanes[8], FloatRegister src,
+-                             FloatRegister dest) DEFINED_ON(arm64);
++                             FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   // lane values 0..3 [sic].
+   inline void permuteHighInt16x8(const uint16_t lanes[4], FloatRegister src,
+@@ -2562,80 +2569,80 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   //   low_16_bytes_of((lhs ++ rhs) >> shift*8), shift must be < 16
+   inline void concatAndRightShiftSimd128(FloatRegister lhs, FloatRegister rhs,
+                                          FloatRegister dest, uint32_t shift)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Rotate right by immediate count:
+   //   low_16_bytes_of((src ++ src) >> shift*8), shift must be < 16
+   inline void rotateRightSimd128(FloatRegister src, FloatRegister dest,
+-                                 uint32_t shift) DEFINED_ON(arm64);
++                                 uint32_t shift) DEFINED_ON(arm64, ppc64);
+ 
+   // Shift bytes with immediate count, shifting in zeroes.  Shift count 0..15.
+ 
+   inline void leftShiftSimd128(Imm32 count, FloatRegister src,
+                                FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void rightShiftSimd128(Imm32 count, FloatRegister src,
+                                 FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Zero extend int values.
+ 
+   inline void zeroExtend8x16To16x8(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+   inline void zeroExtend8x16To32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+   inline void zeroExtend8x16To64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+   inline void zeroExtend16x8To32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+   inline void zeroExtend16x8To64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+   inline void zeroExtend32x4To64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Reverse bytes in lanes.
+ 
+   inline void reverseInt16x8(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void reverseInt32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void reverseInt64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Swizzle - permute with variable indices.  `rhs` holds the lanes parameter.
+ 
+   inline void swizzleInt8x16(FloatRegister lhs, FloatRegister rhs,
+-                             FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                             FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void swizzleInt8x16Relaxed(FloatRegister lhs, FloatRegister rhs,
+                                     FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Integer Add
+ 
+   inline void addInt8x16(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void addInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void addInt16x8(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void addInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void addInt32x4(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void addInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void addInt64x2(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void addInt64x2(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -2643,13 +2650,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // Integer Subtract
+ 
+   inline void subInt8x16(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void subInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void subInt16x8(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void subInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -2658,24 +2665,24 @@ class MacroAssembler : public MacroAssemblerSpecific {
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void subInt32x4(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void subInt64x2(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void subInt64x2(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Integer Multiply
+ 
+   inline void mulInt16x8(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void mulInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void mulInt32x4(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void mulInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -2691,100 +2698,100 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   inline void mulInt64x2(FloatRegister lhs, FloatRegister rhs,
+                          FloatRegister dest, FloatRegister temp1,
+-                         FloatRegister temp2) DEFINED_ON(arm64);
++                         FloatRegister temp2) DEFINED_ON(arm64, ppc64);
+ 
+   // Note for the extMul opcodes, the NxM designation is for the input lanes;
+   // the output lanes are twice as wide.
+   inline void extMulLowInt8x16(FloatRegister lhs, FloatRegister rhs,
+                                FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void extMulHighInt8x16(FloatRegister lhs, FloatRegister rhs,
+                                 FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedExtMulLowInt8x16(FloatRegister lhs, FloatRegister rhs,
+                                        FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedExtMulHighInt8x16(FloatRegister lhs, FloatRegister rhs,
+                                         FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void extMulLowInt16x8(FloatRegister lhs, FloatRegister rhs,
+                                FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void extMulHighInt16x8(FloatRegister lhs, FloatRegister rhs,
+                                 FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedExtMulLowInt16x8(FloatRegister lhs, FloatRegister rhs,
+                                        FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedExtMulHighInt16x8(FloatRegister lhs, FloatRegister rhs,
+                                         FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void extMulLowInt32x4(FloatRegister lhs, FloatRegister rhs,
+                                FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void extMulHighInt32x4(FloatRegister lhs, FloatRegister rhs,
+                                 FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedExtMulLowInt32x4(FloatRegister lhs, FloatRegister rhs,
+                                        FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedExtMulHighInt32x4(FloatRegister lhs, FloatRegister rhs,
+                                         FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void q15MulrSatInt16x8(FloatRegister lhs, FloatRegister rhs,
+                                 FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Integer Negate
+ 
+   inline void negInt8x16(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void negInt16x8(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void negInt32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void negInt64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Saturating integer add
+ 
+   inline void addSatInt8x16(FloatRegister lhs, FloatRegister rhs,
+-                            FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                            FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void addSatInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+                             FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void unsignedAddSatInt8x16(FloatRegister lhs, FloatRegister rhs,
+                                     FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedAddSatInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+                                     FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void addSatInt16x8(FloatRegister lhs, FloatRegister rhs,
+-                            FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                            FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void addSatInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+                             FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void unsignedAddSatInt16x8(FloatRegister lhs, FloatRegister rhs,
+                                     FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedAddSatInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+                                     FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -2792,27 +2799,27 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // Saturating integer subtract
+ 
+   inline void subSatInt8x16(FloatRegister lhs, FloatRegister rhs,
+-                            FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                            FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void subSatInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+                             FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void unsignedSubSatInt8x16(FloatRegister lhs, FloatRegister rhs,
+                                     FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedSubSatInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+                                     FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void subSatInt16x8(FloatRegister lhs, FloatRegister rhs,
+-                            FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                            FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void subSatInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+                             FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void unsignedSubSatInt16x8(FloatRegister lhs, FloatRegister rhs,
+                                     FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedSubSatInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+                                     FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -2820,40 +2827,40 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // Lane-wise integer minimum
+ 
+   inline void minInt8x16(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void minInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void unsignedMinInt8x16(FloatRegister lhs, FloatRegister rhs,
+                                  FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedMinInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+                                  FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void minInt16x8(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void minInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void unsignedMinInt16x8(FloatRegister lhs, FloatRegister rhs,
+                                  FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedMinInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+                                  FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void minInt32x4(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void minInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void unsignedMinInt32x4(FloatRegister lhs, FloatRegister rhs,
+                                  FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedMinInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+                                  FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -2861,40 +2868,40 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // Lane-wise integer maximum
+ 
+   inline void maxInt8x16(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void maxInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void unsignedMaxInt8x16(FloatRegister lhs, FloatRegister rhs,
+                                  FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedMaxInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+                                  FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void maxInt16x8(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void maxInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void unsignedMaxInt16x8(FloatRegister lhs, FloatRegister rhs,
+                                  FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedMaxInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+                                  FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void maxInt32x4(FloatRegister lhs, FloatRegister rhs,
+-                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void maxInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+                          FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void unsignedMaxInt32x4(FloatRegister lhs, FloatRegister rhs,
+                                  FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedMaxInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+                                  FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -2903,25 +2910,25 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   inline void unsignedAverageInt8x16(FloatRegister lhs, FloatRegister rhs,
+                                      FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedAverageInt16x8(FloatRegister lhs, FloatRegister rhs,
+                                      FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Lane-wise integer absolute value
+ 
+   inline void absInt8x16(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void absInt16x8(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void absInt32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void absInt64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Left shift by scalar. Immediates and variable shifts must have been
+   // masked; shifts of zero will work but may or may not generate code.
+@@ -2930,41 +2937,41 @@ class MacroAssembler : public MacroAssemblerSpecific {
+                                FloatRegister temp) DEFINED_ON(x86_shared);
+ 
+   inline void leftShiftInt8x16(FloatRegister lhs, Register rhs,
+-                               FloatRegister dest) DEFINED_ON(arm64);
++                               FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void leftShiftInt8x16(Imm32 count, FloatRegister src,
+                                FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void leftShiftInt16x8(Register rhs, FloatRegister lhsDest)
+       DEFINED_ON(x86_shared);
+ 
+   inline void leftShiftInt16x8(FloatRegister lhs, Register rhs,
+-                               FloatRegister dest) DEFINED_ON(arm64);
++                               FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void leftShiftInt16x8(Imm32 count, FloatRegister src,
+                                FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void leftShiftInt32x4(Register rhs, FloatRegister lhsDest)
+       DEFINED_ON(x86_shared);
+ 
+   inline void leftShiftInt32x4(FloatRegister lhs, Register rhs,
+-                               FloatRegister dest) DEFINED_ON(arm64);
++                               FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void leftShiftInt32x4(Imm32 count, FloatRegister src,
+                                FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void leftShiftInt64x2(Register rhs, FloatRegister lhsDest)
+       DEFINED_ON(x86_shared);
+ 
+   inline void leftShiftInt64x2(FloatRegister lhs, Register rhs,
+-                               FloatRegister dest) DEFINED_ON(arm64);
++                               FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void leftShiftInt64x2(Imm32 count, FloatRegister src,
+                                FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Right shift by scalar. Immediates and variable shifts must have been
+   // masked; shifts of zero will work but may or may not generate code.
+@@ -2973,82 +2980,82 @@ class MacroAssembler : public MacroAssemblerSpecific {
+                                 FloatRegister temp) DEFINED_ON(x86_shared);
+ 
+   inline void rightShiftInt8x16(FloatRegister lhs, Register rhs,
+-                                FloatRegister dest) DEFINED_ON(arm64);
++                                FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void rightShiftInt8x16(Imm32 count, FloatRegister src,
+                                 FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedRightShiftInt8x16(Register rhs, FloatRegister lhsDest,
+                                         FloatRegister temp)
+       DEFINED_ON(x86_shared);
+ 
+   inline void unsignedRightShiftInt8x16(FloatRegister lhs, Register rhs,
+-                                        FloatRegister dest) DEFINED_ON(arm64);
++                                        FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void unsignedRightShiftInt8x16(Imm32 count, FloatRegister src,
+                                         FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void rightShiftInt16x8(Register rhs, FloatRegister lhsDest)
+       DEFINED_ON(x86_shared);
+ 
+   inline void rightShiftInt16x8(FloatRegister lhs, Register rhs,
+-                                FloatRegister dest) DEFINED_ON(arm64);
++                                FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void rightShiftInt16x8(Imm32 count, FloatRegister src,
+                                 FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedRightShiftInt16x8(Register rhs, FloatRegister lhsDest)
+       DEFINED_ON(x86_shared);
+ 
+   inline void unsignedRightShiftInt16x8(FloatRegister lhs, Register rhs,
+-                                        FloatRegister dest) DEFINED_ON(arm64);
++                                        FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void unsignedRightShiftInt16x8(Imm32 count, FloatRegister src,
+                                         FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void rightShiftInt32x4(Register rhs, FloatRegister lhsDest)
+       DEFINED_ON(x86_shared);
+ 
+   inline void rightShiftInt32x4(FloatRegister lhs, Register rhs,
+-                                FloatRegister dest) DEFINED_ON(arm64);
++                                FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void rightShiftInt32x4(Imm32 count, FloatRegister src,
+                                 FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedRightShiftInt32x4(Register rhs, FloatRegister lhsDest)
+       DEFINED_ON(x86_shared);
+ 
+   inline void unsignedRightShiftInt32x4(FloatRegister lhs, Register rhs,
+-                                        FloatRegister dest) DEFINED_ON(arm64);
++                                        FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void unsignedRightShiftInt32x4(Imm32 count, FloatRegister src,
+                                         FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void rightShiftInt64x2(Register rhs, FloatRegister lhsDest,
+                                 FloatRegister temp) DEFINED_ON(x86_shared);
+ 
+   inline void rightShiftInt64x2(Imm32 count, FloatRegister src,
+                                 FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void rightShiftInt64x2(FloatRegister lhs, Register rhs,
+-                                FloatRegister dest) DEFINED_ON(arm64);
++                                FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void unsignedRightShiftInt64x2(Register rhs, FloatRegister lhsDest)
+       DEFINED_ON(x86_shared);
+ 
+   inline void unsignedRightShiftInt64x2(FloatRegister lhs, Register rhs,
+-                                        FloatRegister dest) DEFINED_ON(arm64);
++                                        FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void unsignedRightShiftInt64x2(Imm32 count, FloatRegister src,
+                                         FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Sign replication operation
+ 
+@@ -3067,47 +3074,47 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // Bitwise and, or, xor, not
+ 
+   inline void bitwiseAndSimd128(FloatRegister rhs, FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void bitwiseAndSimd128(FloatRegister lhs, FloatRegister rhs,
+                                 FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void bitwiseAndSimd128(FloatRegister lhs, const SimdConstant& rhs,
+                                 FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void bitwiseOrSimd128(FloatRegister rhs, FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void bitwiseOrSimd128(FloatRegister lhs, FloatRegister rhs,
+                                FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void bitwiseOrSimd128(FloatRegister lhs, const SimdConstant& rhs,
+                                FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void bitwiseXorSimd128(FloatRegister rhs, FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void bitwiseXorSimd128(FloatRegister lhs, FloatRegister rhs,
+                                 FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void bitwiseXorSimd128(FloatRegister lhs, const SimdConstant& rhs,
+                                 FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void bitwiseNotSimd128(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Bitwise AND with compliment: dest = lhs & ~rhs, note only arm64 can do it.
+   inline void bitwiseAndNotSimd128(FloatRegister lhs, FloatRegister rhs,
+-                                   FloatRegister lhsDest) DEFINED_ON(arm64);
++                                   FloatRegister lhsDest) DEFINED_ON(arm64, ppc64);
+ 
+   // Bitwise AND with complement: dest = ~lhs & rhs, note this is not what Wasm
+   // wants but what the x86 hardware offers.  Hence the name.
+ 
+   inline void bitwiseNotAndSimd128(FloatRegister rhs, FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void bitwiseNotAndSimd128(FloatRegister lhs, FloatRegister rhs,
+                                    FloatRegister lhsDest)
+@@ -3120,34 +3127,34 @@ class MacroAssembler : public MacroAssemblerSpecific {
+                                    FloatRegister temp) DEFINED_ON(x86_shared);
+ 
+   inline void bitwiseSelectSimd128(FloatRegister onTrue, FloatRegister onFalse,
+-                                   FloatRegister maskDest) DEFINED_ON(arm64);
++                                   FloatRegister maskDest) DEFINED_ON(arm64, ppc64);
+ 
+   // Population count
+ 
+   inline void popcntInt8x16(FloatRegister src, FloatRegister dest,
+-                            FloatRegister temp) DEFINED_ON(x86_shared);
++                            FloatRegister temp) DEFINED_ON(x86_shared, ppc64);
+ 
+   inline void popcntInt8x16(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(arm64);
++      DEFINED_ON(arm64, ppc64);
+ 
+   // Any lane true, ie, any bit set
+ 
+   inline void anyTrueSimd128(FloatRegister src, Register dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // All lanes true
+ 
+   inline void allTrueInt8x16(FloatRegister src, Register dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void allTrueInt16x8(FloatRegister src, Register dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void allTrueInt32x4(FloatRegister src, Register dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void allTrueInt64x2(FloatRegister src, Register dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Bitmask, ie extract and compress high bits of all lanes
+ 
+@@ -3155,31 +3162,31 @@ class MacroAssembler : public MacroAssemblerSpecific {
+       DEFINED_ON(x86_shared);
+ 
+   inline void bitmaskInt8x16(FloatRegister src, Register dest,
+-                             FloatRegister temp) DEFINED_ON(arm64);
++                             FloatRegister temp) DEFINED_ON(arm64, ppc64);
+ 
+   inline void bitmaskInt16x8(FloatRegister src, Register dest)
+       DEFINED_ON(x86_shared);
+ 
+   inline void bitmaskInt16x8(FloatRegister src, Register dest,
+-                             FloatRegister temp) DEFINED_ON(arm64);
++                             FloatRegister temp) DEFINED_ON(arm64, ppc64);
+ 
+   inline void bitmaskInt32x4(FloatRegister src, Register dest)
+       DEFINED_ON(x86_shared);
+ 
+   inline void bitmaskInt32x4(FloatRegister src, Register dest,
+-                             FloatRegister temp) DEFINED_ON(arm64);
++                             FloatRegister temp) DEFINED_ON(arm64, ppc64);
+ 
+   inline void bitmaskInt64x2(FloatRegister src, Register dest)
+       DEFINED_ON(x86_shared);
+ 
+   inline void bitmaskInt64x2(FloatRegister src, Register dest,
+-                             FloatRegister temp) DEFINED_ON(arm64);
++                             FloatRegister temp) DEFINED_ON(arm64, ppc64);
+ 
+   // Comparisons (integer and floating-point)
+ 
+   inline void compareInt8x16(Assembler::Condition cond, FloatRegister rhs,
+                              FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // On x86_shared, limited to !=, ==, <=, >
+   inline void compareInt8x16(Assembler::Condition cond, FloatRegister lhs,
+@@ -3189,15 +3196,15 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // On arm64, use any integer comparison condition.
+   inline void compareInt8x16(Assembler::Condition cond, FloatRegister lhs,
+                              FloatRegister rhs, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void compareInt16x8(Assembler::Condition cond, FloatRegister rhs,
+                              FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void compareInt16x8(Assembler::Condition cond, FloatRegister lhs,
+                              FloatRegister rhs, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // On x86_shared, limited to !=, ==, <=, >
+   inline void compareInt16x8(Assembler::Condition cond, FloatRegister lhs,
+@@ -3207,7 +3214,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // On x86_shared, limited to !=, ==, <=, >
+   inline void compareInt32x4(Assembler::Condition cond, FloatRegister rhs,
+                              FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void compareInt32x4(Assembler::Condition cond, FloatRegister lhs,
+                              const SimdConstant& rhs, FloatRegister dest)
+@@ -3216,7 +3223,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // On arm64, use any integer comparison condition.
+   inline void compareInt32x4(Assembler::Condition cond, FloatRegister lhs,
+                              FloatRegister rhs, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void compareForEqualityInt64x2(Assembler::Condition cond,
+                                         FloatRegister lhs, FloatRegister rhs,
+@@ -3230,15 +3237,15 @@ class MacroAssembler : public MacroAssemblerSpecific {
+       DEFINED_ON(x86_shared);
+ 
+   inline void compareInt64x2(Assembler::Condition cond, FloatRegister rhs,
+-                             FloatRegister lhsDest) DEFINED_ON(arm64);
++                             FloatRegister lhsDest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void compareInt64x2(Assembler::Condition cond, FloatRegister lhs,
+                              FloatRegister rhs, FloatRegister dest)
+-      DEFINED_ON(arm64);
++      DEFINED_ON(arm64, ppc64);
+ 
+   inline void compareFloat32x4(Assembler::Condition cond, FloatRegister rhs,
+                                FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // On x86_shared, limited to ==, !=, <, <=
+   inline void compareFloat32x4(Assembler::Condition cond, FloatRegister lhs,
+@@ -3249,11 +3256,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // On arm64, use any float-point comparison condition.
+   inline void compareFloat32x4(Assembler::Condition cond, FloatRegister lhs,
+                                FloatRegister rhs, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void compareFloat64x2(Assembler::Condition cond, FloatRegister rhs,
+                                FloatRegister lhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // On x86_shared, limited to ==, !=, <, <=
+   inline void compareFloat64x2(Assembler::Condition cond, FloatRegister lhs,
+@@ -3264,7 +3271,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // On arm64, use any float-point comparison condition.
+   inline void compareFloat64x2(Assembler::Condition cond, FloatRegister lhs,
+                                FloatRegister rhs, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Load
+ 
+@@ -3273,92 +3280,92 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   inline FaultingCodeOffset loadUnalignedSimd128(const Address& src,
+                                                  FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline FaultingCodeOffset loadUnalignedSimd128(const BaseIndex& src,
+                                                  FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Store
+ 
+   inline FaultingCodeOffset storeUnalignedSimd128(FloatRegister src,
+                                                   const Address& dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline FaultingCodeOffset storeUnalignedSimd128(FloatRegister src,
+                                                   const BaseIndex& dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Floating point negation
+ 
+   inline void negFloat32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void negFloat64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Floating point absolute value
+ 
+   inline void absFloat32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void absFloat64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // NaN-propagating minimum
+ 
+   inline void minFloat32x4(FloatRegister lhs, FloatRegister rhs,
+                            FloatRegister dest, FloatRegister temp1,
+-                           FloatRegister temp2) DEFINED_ON(x86_shared);
++                           FloatRegister temp2) DEFINED_ON(x86_shared, ppc64);
+ 
+   inline void minFloat32x4(FloatRegister rhs, FloatRegister lhsDest)
+-      DEFINED_ON(arm64);
++      DEFINED_ON(arm64, ppc64);
+ 
+   inline void minFloat32x4(FloatRegister lhs, FloatRegister rhs,
+-                           FloatRegister dest) DEFINED_ON(arm64);
++                           FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void minFloat64x2(FloatRegister lhs, FloatRegister rhs,
+                            FloatRegister dest, FloatRegister temp1,
+-                           FloatRegister temp2) DEFINED_ON(x86_shared);
++                           FloatRegister temp2) DEFINED_ON(x86_shared, ppc64);
+ 
+   inline void minFloat64x2(FloatRegister rhs, FloatRegister lhsDest)
+-      DEFINED_ON(arm64);
++      DEFINED_ON(arm64, ppc64);
+ 
+   inline void minFloat64x2(FloatRegister lhs, FloatRegister rhs,
+-                           FloatRegister dest) DEFINED_ON(arm64);
++                           FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   // NaN-propagating maximum
+ 
+   inline void maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
+                            FloatRegister dest, FloatRegister temp1,
+-                           FloatRegister temp2) DEFINED_ON(x86_shared);
++                           FloatRegister temp2) DEFINED_ON(x86_shared, ppc64);
+ 
+   inline void maxFloat32x4(FloatRegister rhs, FloatRegister lhsDest)
+-      DEFINED_ON(arm64);
++      DEFINED_ON(arm64, ppc64);
+ 
+   inline void maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
+-                           FloatRegister dest) DEFINED_ON(arm64);
++                           FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   inline void maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
+                            FloatRegister dest, FloatRegister temp1,
+-                           FloatRegister temp2) DEFINED_ON(x86_shared);
++                           FloatRegister temp2) DEFINED_ON(x86_shared, ppc64);
+ 
+   inline void maxFloat64x2(FloatRegister rhs, FloatRegister lhsDest)
+-      DEFINED_ON(arm64);
++      DEFINED_ON(arm64, ppc64);
+ 
+   inline void maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
+-                           FloatRegister dest) DEFINED_ON(arm64);
++                           FloatRegister dest) DEFINED_ON(arm64, ppc64);
+ 
+   // Floating add
+ 
+   inline void addFloat32x4(FloatRegister lhs, FloatRegister rhs,
+-                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void addFloat32x4(FloatRegister lhs, const SimdConstant& rhs,
+                            FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void addFloat64x2(FloatRegister lhs, FloatRegister rhs,
+-                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void addFloat64x2(FloatRegister lhs, const SimdConstant& rhs,
+                            FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -3366,13 +3373,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // Floating subtract
+ 
+   inline void subFloat32x4(FloatRegister lhs, FloatRegister rhs,
+-                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void subFloat32x4(FloatRegister lhs, const SimdConstant& rhs,
+                            FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void subFloat64x2(FloatRegister lhs, FloatRegister rhs,
+-                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void subFloat64x2(FloatRegister lhs, const SimdConstant& rhs,
+                            FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -3380,13 +3387,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // Floating division
+ 
+   inline void divFloat32x4(FloatRegister lhs, FloatRegister rhs,
+-                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void divFloat32x4(FloatRegister lhs, const SimdConstant& rhs,
+                            FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void divFloat64x2(FloatRegister lhs, FloatRegister rhs,
+-                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void divFloat64x2(FloatRegister lhs, const SimdConstant& rhs,
+                            FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -3394,13 +3401,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // Floating Multiply
+ 
+   inline void mulFloat32x4(FloatRegister lhs, FloatRegister rhs,
+-                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void mulFloat32x4(FloatRegister lhs, const SimdConstant& rhs,
+                            FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void mulFloat64x2(FloatRegister lhs, FloatRegister rhs,
+-                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void mulFloat64x2(FloatRegister lhs, const SimdConstant& rhs,
+                            FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -3408,91 +3415,91 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // Pairwise add
+ 
+   inline void extAddPairwiseInt8x16(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedExtAddPairwiseInt8x16(FloatRegister src,
+                                             FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void extAddPairwiseInt16x8(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedExtAddPairwiseInt16x8(FloatRegister src,
+                                             FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Floating square root
+ 
+   inline void sqrtFloat32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void sqrtFloat64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Integer to floating point with rounding
+ 
+   inline void convertInt32x4ToFloat32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedConvertInt32x4ToFloat32x4(FloatRegister src,
+                                                 FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void convertInt32x4ToFloat64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedConvertInt32x4ToFloat64x2(FloatRegister src,
+                                                 FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Floating point to integer with saturation
+ 
+   inline void truncSatFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedTruncSatFloat32x4ToInt32x4(FloatRegister src,
+                                                  FloatRegister dest,
+                                                  FloatRegister temp)
+-      DEFINED_ON(x86_shared);
++      DEFINED_ON(x86_shared, ppc64);
+ 
+   inline void unsignedTruncSatFloat32x4ToInt32x4(FloatRegister src,
+                                                  FloatRegister dest)
+-      DEFINED_ON(arm64);
++      DEFINED_ON(arm64, ppc64);
+ 
+   inline void truncSatFloat64x2ToInt32x4(FloatRegister src, FloatRegister dest,
+                                          FloatRegister temp)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedTruncSatFloat64x2ToInt32x4(FloatRegister src,
+                                                  FloatRegister dest,
+                                                  FloatRegister temp)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void truncFloat32x4ToInt32x4Relaxed(FloatRegister src,
+                                              FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedTruncFloat32x4ToInt32x4Relaxed(FloatRegister src,
+                                                      FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void truncFloat64x2ToInt32x4Relaxed(FloatRegister src,
+                                              FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedTruncFloat64x2ToInt32x4Relaxed(FloatRegister src,
+                                                      FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Floating point narrowing
+ 
+   inline void convertFloat64x2ToFloat32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Floating point widening
+ 
+   inline void convertFloat32x4ToFloat64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Integer to integer narrowing
+ 
+@@ -3500,65 +3507,65 @@ class MacroAssembler : public MacroAssemblerSpecific {
+                             FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void narrowInt16x8(FloatRegister lhs, FloatRegister rhs,
+-                            FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                            FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedNarrowInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+                                     FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void unsignedNarrowInt16x8(FloatRegister lhs, FloatRegister rhs,
+                                     FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void narrowInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+                             FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void narrowInt32x4(FloatRegister lhs, FloatRegister rhs,
+-                            FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                            FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedNarrowInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+                                     FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void unsignedNarrowInt32x4(FloatRegister lhs, FloatRegister rhs,
+                                     FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Integer to integer widening
+ 
+   inline void widenLowInt8x16(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void widenHighInt8x16(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedWidenLowInt8x16(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedWidenHighInt8x16(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void widenLowInt16x8(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void widenHighInt16x8(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedWidenLowInt16x8(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedWidenHighInt16x8(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void widenLowInt32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedWidenLowInt32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void widenHighInt32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void unsignedWidenHighInt32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Compare-based minimum/maximum
+   //
+@@ -3570,47 +3577,47 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   inline void pseudoMinFloat32x4(FloatRegister rhsOrRhsDest,
+                                  FloatRegister lhsOrLhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void pseudoMinFloat32x4(FloatRegister lhs, FloatRegister rhs,
+                                  FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void pseudoMinFloat64x2(FloatRegister rhsOrRhsDest,
+                                  FloatRegister lhsOrLhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void pseudoMinFloat64x2(FloatRegister lhs, FloatRegister rhs,
+                                  FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void pseudoMaxFloat32x4(FloatRegister rhsOrRhsDest,
+                                  FloatRegister lhsOrLhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void pseudoMaxFloat32x4(FloatRegister lhs, FloatRegister rhs,
+                                  FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void pseudoMaxFloat64x2(FloatRegister rhsOrRhsDest,
+                                  FloatRegister lhsOrLhsDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void pseudoMaxFloat64x2(FloatRegister lhs, FloatRegister rhs,
+                                  FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Widening/pairwise integer dot product
+ 
+   inline void widenDotInt16x8(FloatRegister lhs, FloatRegister rhs,
+-                              FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++                              FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void widenDotInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+                               FloatRegister dest) DEFINED_ON(x86_shared);
+ 
+   inline void dotInt8x16Int7x16(FloatRegister lhs, FloatRegister rhs,
+                                 FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void dotInt8x16Int7x16ThenAdd(FloatRegister lhs, FloatRegister rhs,
+                                        FloatRegister dest)
+@@ -3618,81 +3625,81 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   inline void dotInt8x16Int7x16ThenAdd(FloatRegister lhs, FloatRegister rhs,
+                                        FloatRegister dest, FloatRegister temp)
+-      DEFINED_ON(arm64);
++      DEFINED_ON(arm64, ppc64);
+ 
+   // Floating point rounding
+ 
+   inline void ceilFloat32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void ceilFloat64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void floorFloat32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void floorFloat64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void truncFloat32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void truncFloat64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void nearestFloat32x4(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void nearestFloat64x2(FloatRegister src, FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   // Floating multiply-accumulate: srcDest [+-]= src1 * src2
+ 
+   inline void fmaFloat32x4(FloatRegister src1, FloatRegister src2,
+-                           FloatRegister srcDest) DEFINED_ON(x86_shared, arm64);
++                           FloatRegister srcDest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void fnmaFloat32x4(FloatRegister src1, FloatRegister src2,
+                             FloatRegister srcDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void fmaFloat64x2(FloatRegister src1, FloatRegister src2,
+-                           FloatRegister srcDest) DEFINED_ON(x86_shared, arm64);
++                           FloatRegister srcDest) DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void fnmaFloat64x2(FloatRegister src1, FloatRegister src2,
+                             FloatRegister srcDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void minFloat32x4Relaxed(FloatRegister src, FloatRegister srcDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void minFloat32x4Relaxed(FloatRegister lhs, FloatRegister rhs,
+                                   FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void maxFloat32x4Relaxed(FloatRegister src, FloatRegister srcDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void maxFloat32x4Relaxed(FloatRegister lhs, FloatRegister rhs,
+                                   FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void minFloat64x2Relaxed(FloatRegister src, FloatRegister srcDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void minFloat64x2Relaxed(FloatRegister lhs, FloatRegister rhs,
+                                   FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void maxFloat64x2Relaxed(FloatRegister src, FloatRegister srcDest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void maxFloat64x2Relaxed(FloatRegister lhs, FloatRegister rhs,
+                                   FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+   inline void q15MulrInt16x8Relaxed(FloatRegister lhs, FloatRegister rhs,
+                                     FloatRegister dest)
+-      DEFINED_ON(x86_shared, arm64);
++      DEFINED_ON(x86_shared, arm64, ppc64);
+ 
+  public:
+   // ========================================================================
+@@ -3717,10 +3724,10 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   // temp required on x86 and x64; must be undefined on mips64 and loong64.
+   void convertUInt64ToFloat32(Register64 src, FloatRegister dest, Register temp)
+-      DEFINED_ON(arm64, mips64, loong64, riscv64, wasm32, x64, x86);
++      DEFINED_ON(arm64, mips64, loong64, ppc64, riscv64, wasm32, x64, x86);
+ 
+   void convertInt64ToFloat32(Register64 src, FloatRegister dest)
+-      DEFINED_ON(arm64, mips64, loong64, riscv64, wasm32, x64, x86);
++      DEFINED_ON(arm64, mips64, loong64, ppc64, riscv64, wasm32, x64, x86);
+ 
+   bool convertUInt64ToDoubleNeedsTemp() PER_ARCH;
+ 
+@@ -3801,16 +3808,16 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // Scalar::Int64.
+   void wasmLoad(const wasm::MemoryAccessDesc& access, Register memoryBase,
+                 Register ptr, Register ptrScratch, AnyRegister output)
+-      DEFINED_ON(arm, loong64, riscv64, mips64);
++      DEFINED_ON(arm, loong64, riscv64, mips64, ppc64);
+   void wasmLoadI64(const wasm::MemoryAccessDesc& access, Register memoryBase,
+                    Register ptr, Register ptrScratch, Register64 output)
+-      DEFINED_ON(arm, mips64, loong64, riscv64);
++      DEFINED_ON(arm, mips64, loong64, riscv64, ppc64);
+   void wasmStore(const wasm::MemoryAccessDesc& access, AnyRegister value,
+                  Register memoryBase, Register ptr, Register ptrScratch)
+-      DEFINED_ON(arm, loong64, riscv64, mips64);
++      DEFINED_ON(arm, loong64, riscv64, mips64, ppc64);
+   void wasmStoreI64(const wasm::MemoryAccessDesc& access, Register64 value,
+                     Register memoryBase, Register ptr, Register ptrScratch)
+-      DEFINED_ON(arm, mips64, loong64, riscv64);
++      DEFINED_ON(arm, mips64, loong64, riscv64, ppc64);
+ 
+   // These accept general memoryBase + ptr + offset (in `access`); the offset is
+   // always smaller than the guard region.  They will insert an additional add
+@@ -3889,11 +3896,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   void wasmTruncateDoubleToInt64(FloatRegister input, Register64 output,
+                                  bool isSaturating, Label* oolEntry,
+                                  Label* oolRejoin, FloatRegister tempDouble)
+-      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32);
++      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32, ppc64);
+   void wasmTruncateDoubleToUInt64(FloatRegister input, Register64 output,
+                                   bool isSaturating, Label* oolEntry,
+                                   Label* oolRejoin, FloatRegister tempDouble)
+-      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32);
++      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32, ppc64);
+   void oolWasmTruncateCheckF64ToI64(FloatRegister input, Register64 output,
+                                     TruncFlags flags,
+                                     const wasm::TrapSiteDesc& trapSiteDesc,
+@@ -3902,11 +3909,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   void wasmTruncateFloat32ToInt64(FloatRegister input, Register64 output,
+                                   bool isSaturating, Label* oolEntry,
+                                   Label* oolRejoin, FloatRegister tempDouble)
+-      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32);
++      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32, ppc64);
+   void wasmTruncateFloat32ToUInt64(FloatRegister input, Register64 output,
+                                    bool isSaturating, Label* oolEntry,
+                                    Label* oolRejoin, FloatRegister tempDouble)
+-      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32);
++      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32, ppc64);
+   void oolWasmTruncateCheckF32ToI64(FloatRegister input, Register64 output,
+                                     TruncFlags flags,
+                                     const wasm::TrapSiteDesc& trapSiteDesc,
+@@ -4220,7 +4227,8 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   // convention, which requires predictable high bits.  In practice, this means
+   // that the 32-bit value will be zero-extended or sign-extended to 64 bits as
+   // appropriate for the platform.
+-  void widenInt32(Register r) DEFINED_ON(arm64, x64, mips64, loong64, riscv64);
++  void widenInt32(Register r)
++      DEFINED_ON(arm64, x64, mips64, loong64, riscv64, ppc64);
+ 
+   // As enterFakeExitFrame(), but using register conventions appropriate for
+   // wasm stubs.
+@@ -4287,13 +4295,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+                        const Address& mem, Register expected,
+                        Register replacement, Register valueTemp,
+                        Register offsetTemp, Register maskTemp, Register output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void compareExchange(Scalar::Type type, Synchronization sync,
+                        const BaseIndex& mem, Register expected,
+                        Register replacement, Register valueTemp,
+                        Register offsetTemp, Register maskTemp, Register output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   // x86: `expected` and `output` must be edx:eax; `replacement` is ecx:ebx.
+   // x64: `output` must be rax.
+@@ -4303,12 +4311,12 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   void compareExchange64(Synchronization sync, const Address& mem,
+                          Register64 expected, Register64 replacement,
+                          Register64 output)
+-      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64);
++      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64, ppc64);
+ 
+   void compareExchange64(Synchronization sync, const BaseIndex& mem,
+                          Register64 expected, Register64 replacement,
+                          Register64 output)
+-      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64);
++      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64, ppc64);
+ 
+   // Exchange with memory.  Return the value initially in memory.
+   // MIPS: `valueTemp`, `offsetTemp` and `maskTemp` must be defined for 8-bit
+@@ -4325,12 +4333,12 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   void atomicExchange(Scalar::Type type, Synchronization sync,
+                       const Address& mem, Register value, Register valueTemp,
+                       Register offsetTemp, Register maskTemp, Register output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void atomicExchange(Scalar::Type type, Synchronization sync,
+                       const BaseIndex& mem, Register value, Register valueTemp,
+                       Register offsetTemp, Register maskTemp, Register output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   // x86: `value` must be ecx:ebx; `output` must be edx:eax.
+   // ARM: `value` and `output` must be distinct and (even,odd) pairs.
+@@ -4338,11 +4346,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   void atomicExchange64(Synchronization sync, const Address& mem,
+                         Register64 value, Register64 output)
+-      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64);
++      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64, ppc64);
+ 
+   void atomicExchange64(Synchronization sync, const BaseIndex& mem,
+                         Register64 value, Register64 output)
+-      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64);
++      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64, ppc64);
+ 
+   // Read-modify-write with memory.  Return the value in memory before the
+   // operation.
+@@ -4376,12 +4384,12 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   void atomicFetchOp(Scalar::Type type, Synchronization sync, AtomicOp op,
+                      Register value, const Address& mem, Register valueTemp,
+                      Register offsetTemp, Register maskTemp, Register output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void atomicFetchOp(Scalar::Type type, Synchronization sync, AtomicOp op,
+                      Register value, const BaseIndex& mem, Register valueTemp,
+                      Register offsetTemp, Register maskTemp, Register output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   // x86:
+   //   `temp` must be ecx:ebx; `output` must be edx:eax.
+@@ -4395,7 +4403,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   void atomicFetchOp64(Synchronization sync, AtomicOp op, Register64 value,
+                        const Address& mem, Register64 temp, Register64 output)
+-      DEFINED_ON(arm, arm64, x64, mips64, loong64, riscv64);
++      DEFINED_ON(arm, arm64, x64, mips64, loong64, riscv64, ppc64);
+ 
+   void atomicFetchOp64(Synchronization sync, AtomicOp op, const Address& value,
+                        const Address& mem, Register64 temp, Register64 output)
+@@ -4403,7 +4411,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   void atomicFetchOp64(Synchronization sync, AtomicOp op, Register64 value,
+                        const BaseIndex& mem, Register64 temp, Register64 output)
+-      DEFINED_ON(arm, arm64, x64, mips64, loong64, riscv64);
++      DEFINED_ON(arm, arm64, x64, mips64, loong64, riscv64, ppc64);
+ 
+   void atomicFetchOp64(Synchronization sync, AtomicOp op, const Address& value,
+                        const BaseIndex& mem, Register64 temp, Register64 output)
+@@ -4421,14 +4429,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ 
+   void atomicEffectOp64(Synchronization sync, AtomicOp op, Register64 value,
+                         const Address& mem, Register64 temp)
+-      DEFINED_ON(arm, arm64, mips64, loong64, riscv64);
++      DEFINED_ON(arm, arm64, mips64, loong64, riscv64, ppc64);
+ 
+   void atomicEffectOp64(Synchronization sync, AtomicOp op, Register64 value,
+                         const BaseIndex& mem) DEFINED_ON(x64);
+ 
+   void atomicEffectOp64(Synchronization sync, AtomicOp op, Register64 value,
+                         const BaseIndex& mem, Register64 temp)
+-      DEFINED_ON(arm, arm64, mips64, loong64, riscv64);
++      DEFINED_ON(arm, arm64, mips64, loong64, riscv64, ppc64);
+ 
+   // 64-bit atomic load. On 64-bit systems, use regular load with
+   // Synchronization::Load, not this method.
+@@ -4481,14 +4489,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
+                            Register replacement, Register valueTemp,
+                            Register offsetTemp, Register maskTemp,
+                            Register output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void wasmCompareExchange(const wasm::MemoryAccessDesc& access,
+                            const BaseIndex& mem, Register expected,
+                            Register replacement, Register valueTemp,
+                            Register offsetTemp, Register maskTemp,
+                            Register output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
+                           const Address& mem, Register value, Register output)
+@@ -4502,13 +4510,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+                           const Address& mem, Register value,
+                           Register valueTemp, Register offsetTemp,
+                           Register maskTemp, Register output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
+                           const BaseIndex& mem, Register value,
+                           Register valueTemp, Register offsetTemp,
+                           Register maskTemp, Register output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access, AtomicOp op,
+                          Register value, const Address& mem, Register temp,
+@@ -4529,13 +4537,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   void wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access, AtomicOp op,
+                          Register value, const Address& mem, Register valueTemp,
+                          Register offsetTemp, Register maskTemp,
+-                         Register output) DEFINED_ON(mips64, loong64, riscv64);
++                         Register output)
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access, AtomicOp op,
+                          Register value, const BaseIndex& mem,
+                          Register valueTemp, Register offsetTemp,
+                          Register maskTemp, Register output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   // Read-modify-write with memory.  Return no value.
+   //
+@@ -4562,13 +4571,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+                           Register value, const Address& mem,
+                           Register valueTemp, Register offsetTemp,
+                           Register maskTemp)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access, AtomicOp op,
+                           Register value, const BaseIndex& mem,
+                           Register valueTemp, Register offsetTemp,
+                           Register maskTemp)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   // 64-bit wide operations.
+ 
+@@ -4626,12 +4635,12 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   void wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access, AtomicOp op,
+                            Register64 value, const Address& mem,
+                            Register64 temp, Register64 output)
+-      DEFINED_ON(arm, arm64, mips64, loong64, riscv64, x64);
++      DEFINED_ON(arm, arm64, mips64, loong64, riscv64, ppc64, x64);
+ 
+   void wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access, AtomicOp op,
+                            Register64 value, const BaseIndex& mem,
+                            Register64 temp, Register64 output)
+-      DEFINED_ON(arm, arm64, mips64, loong64, riscv64, x64);
++      DEFINED_ON(arm, arm64, mips64, loong64, riscv64, ppc64, x64);
+ 
+   void wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access, AtomicOp op,
+                            const Address& value, const Address& mem,
+@@ -4684,14 +4693,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
+                          Register replacement, Register valueTemp,
+                          Register offsetTemp, Register maskTemp, Register temp,
+                          AnyRegister output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void compareExchangeJS(Scalar::Type arrayType, Synchronization sync,
+                          const BaseIndex& mem, Register expected,
+                          Register replacement, Register valueTemp,
+                          Register offsetTemp, Register maskTemp, Register temp,
+                          AnyRegister output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void atomicExchangeJS(Scalar::Type arrayType, Synchronization sync,
+                         const Address& mem, Register value, Register temp,
+@@ -4705,13 +4714,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+                         const Address& mem, Register value, Register valueTemp,
+                         Register offsetTemp, Register maskTemp, Register temp,
+                         AnyRegister output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void atomicExchangeJS(Scalar::Type arrayType, Synchronization sync,
+                         const BaseIndex& mem, Register value,
+                         Register valueTemp, Register offsetTemp,
+                         Register maskTemp, Register temp, AnyRegister output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void atomicFetchOpJS(Scalar::Type arrayType, Synchronization sync,
+                        AtomicOp op, Register value, const Address& mem,
+@@ -4737,13 +4746,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+                        AtomicOp op, Register value, const Address& mem,
+                        Register valueTemp, Register offsetTemp,
+                        Register maskTemp, Register temp, AnyRegister output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void atomicFetchOpJS(Scalar::Type arrayType, Synchronization sync,
+                        AtomicOp op, Register value, const BaseIndex& mem,
+                        Register valueTemp, Register offsetTemp,
+                        Register maskTemp, Register temp, AnyRegister output)
+-      DEFINED_ON(mips64, loong64, riscv64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void atomicEffectOpJS(Scalar::Type arrayType, Synchronization sync,
+                         AtomicOp op, Register value, const Address& mem,
+@@ -4764,12 +4773,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   void atomicEffectOpJS(Scalar::Type arrayType, Synchronization sync,
+                         AtomicOp op, Register value, const Address& mem,
+                         Register valueTemp, Register offsetTemp,
+-                        Register maskTemp) DEFINED_ON(mips64, loong64, riscv64);
++                        Register maskTemp)
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void atomicEffectOpJS(Scalar::Type arrayType, Synchronization sync,
+                         AtomicOp op, Register value, const BaseIndex& mem,
+                         Register valueTemp, Register offsetTemp,
+-                        Register maskTemp) DEFINED_ON(mips64, loong64, riscv64);
++                        Register maskTemp)
++      DEFINED_ON(mips64, loong64, riscv64, ppc64);
+ 
+   void atomicIsLockFreeJS(Register value, Register output);
+ 
+@@ -5928,7 +5939,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+   inline void addStackPtrTo(T t);
+ 
+   void subFromStackPtr(Imm32 imm32)
+-      DEFINED_ON(mips64, loong64, riscv64, wasm32, arm, x86, x64);
++      DEFINED_ON(mips64, loong64, riscv64, ppc64, wasm32, arm, x86, x64);
+   void subFromStackPtr(Register reg);
+ 
+   template <typename T>
+diff --git a/js/src/jit/MoveEmitter.h b/js/src/jit/MoveEmitter.h
+index 642829c070d6..3a883c596ca0 100644
+--- a/js/src/jit/MoveEmitter.h
++++ b/js/src/jit/MoveEmitter.h
+@@ -17,6 +17,8 @@
+ #  include "jit/loong64/MoveEmitter-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ #  include "jit/riscv64/MoveEmitter-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++#  include "jit/ppc64/MoveEmitter-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ #  include "jit/wasm32/MoveEmitter-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/MoveResolver.cpp b/js/src/jit/MoveResolver.cpp
+index d2e1f12700bd..8e622407a0a8 100644
+--- a/js/src/jit/MoveResolver.cpp
++++ b/js/src/jit/MoveResolver.cpp
+@@ -57,6 +57,22 @@ bool MoveResolver::addMove(const MoveOperand& from, const MoveOperand& to,
+                            MoveOp::Type type) {
+   // Assert that we're not doing no-op moves.
+   MOZ_ASSERT(!(from == to));
++#ifdef JS_CODEGEN_PPC64
++  // PPC64 FloatRegisters expose Single/Double kinds that have distinct code()
++  // values but share one physical register. The register allocator can emit a
++  // move between two such kind-views of the same FPR (e.g. f2-Double to
++  // f2-Single); these are no-ops on the hardware, are not caught by the
++  // (from == to) assert above, and would otherwise trip the
++  // !from().aliases(to()) invariant the resolver relies on later. Drop them.
++  //
++  // This would be correct for any backend whose FloatRegister has multiple
++  // kinds aliasing one physical register, and could be un-gated if another
++  // such backend needs it, but it is scoped to PPC64 so move resolution on
++  // tier-1 platforms is left unchanged.
++  if (from.aliases(to)) {
++    return true;
++  }
++#endif
+   PendingMove* pm = movePool_.allocate(from, to, type);
+   if (!pm) {
+     return false;
+diff --git a/js/src/jit/RegisterAllocator.h b/js/src/jit/RegisterAllocator.h
+index eda9933f6322..42e48111046a 100644
+--- a/js/src/jit/RegisterAllocator.h
++++ b/js/src/jit/RegisterAllocator.h
+@@ -262,9 +262,10 @@ class RegisterAllocator {
+  public:
+   template <typename TakeableSet>
+   static void takeWasmRegisters(TakeableSet& regs) {
+-#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) ||      \
+-    defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
+-    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) ||         \
++    defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) ||    \
++    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
++    defined(JS_CODEGEN_PPC64)
+     regs.take(HeapReg);
+ #endif
+     MOZ_ASSERT(!regs.has(FramePointer));
+diff --git a/js/src/jit/Registers.h b/js/src/jit/Registers.h
+index e0d02e2fb60d..423777ce38cd 100644
+--- a/js/src/jit/Registers.h
++++ b/js/src/jit/Registers.h
+@@ -20,6 +20,8 @@
+ #  include "jit/loong64/Architecture-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ #  include "jit/riscv64/Architecture-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++#  include "jit/ppc64/Architecture-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ #  include "jit/wasm32/Architecture-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/Safepoints.cpp b/js/src/jit/Safepoints.cpp
+index 42e305f053af..8e3a25c3c5ff 100644
+--- a/js/src/jit/Safepoints.cpp
++++ b/js/src/jit/Safepoints.cpp
+@@ -63,6 +63,11 @@ static void WriteFloatRegisterMask(CompactBufferWriter& stream,
+       stream.writeUnsigned64(bits.low());
+       stream.writeUnsigned64(bits.high());
+       break;
++#elif defined(JS_CODEGEN_PPC64)
++    case 16:
++      stream.writeUnsigned64(static_cast<uint64_t>(bits));
++      stream.writeUnsigned64(static_cast<uint64_t>(bits >> 64));
++      break;
+ #else
+     case 1:
+       stream.writeByte(bits);
+@@ -88,6 +93,12 @@ static FloatRegisters::SetType ReadFloatRegisterMask(
+       uint64_t high = stream.readUnsigned64();
+       return Bitset128(high, low);
+     }
++#elif defined(JS_CODEGEN_PPC64)
++    case 16: {
++      uint64_t low = stream.readUnsigned64();
++      uint64_t high = stream.readUnsigned64();
++      return FloatRegisters::SetType(high) << 64 | FloatRegisters::SetType(low);
++    }
+ #else
+     case 1:
+       return stream.readByte();
+diff --git a/js/src/jit/SharedICHelpers-inl.h b/js/src/jit/SharedICHelpers-inl.h
+index eedccc831732..1005b140f1df 100644
+--- a/js/src/jit/SharedICHelpers-inl.h
++++ b/js/src/jit/SharedICHelpers-inl.h
+@@ -19,6 +19,8 @@
+ #  include "jit/loong64/SharedICHelpers-loong64-inl.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ #  include "jit/riscv64/SharedICHelpers-riscv64-inl.h"
++#elif defined(JS_CODEGEN_PPC64)
++#  include "jit/ppc64/SharedICHelpers-ppc64-inl.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ #  include "jit/wasm32/SharedICHelpers-wasm32-inl.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/SharedICHelpers.h b/js/src/jit/SharedICHelpers.h
+index 1ebd61e44509..f2703c6f986c 100644
+--- a/js/src/jit/SharedICHelpers.h
++++ b/js/src/jit/SharedICHelpers.h
+@@ -19,6 +19,8 @@
+ #  include "jit/loong64/SharedICHelpers-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ #  include "jit/riscv64/SharedICHelpers-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++#  include "jit/ppc64/SharedICHelpers-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ #  include "jit/wasm32/SharedICHelpers-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/SharedICRegisters.h b/js/src/jit/SharedICRegisters.h
+index c3ab86bf0a82..5b270d0c166a 100644
+--- a/js/src/jit/SharedICRegisters.h
++++ b/js/src/jit/SharedICRegisters.h
+@@ -19,6 +19,8 @@
+ #  include "jit/loong64/SharedICRegisters-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ #  include "jit/riscv64/SharedICRegisters-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++#  include "jit/ppc64/SharedICRegisters-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ #  include "jit/wasm32/SharedICRegisters-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/Simulator.h b/js/src/jit/Simulator.h
+index 39503716f10d..9f60baf53198 100644
+--- a/js/src/jit/Simulator.h
++++ b/js/src/jit/Simulator.h
+@@ -15,6 +15,8 @@
+ #  include "jit/loong64/Simulator-loong64.h"
+ #elif defined(JS_SIMULATOR_RISCV64)
+ #  include "jit/riscv64/Simulator-riscv64.h"
++#elif defined(JS_SIMULATOR_PPC64)
++#  include "jit/ppc64/Simulator-ppc64.h"
+ #elif defined(JS_SIMULATOR)
+ #  error "Unexpected simulator platform"
+ #endif
+diff --git a/js/src/jit/moz.build b/js/src/jit/moz.build
+index 5b5df3e5b7b2..36ef65d6221a 100644
+--- a/js/src/jit/moz.build
++++ b/js/src/jit/moz.build
+@@ -228,6 +228,18 @@ elif CONFIG["JS_CODEGEN_LOONG64"]:
+     ]
+     if CONFIG["JS_SIMULATOR_LOONG64"]:
+         UNIFIED_SOURCES += ["loong64/Simulator-loong64.cpp"]
++elif CONFIG["JS_CODEGEN_PPC64"]:
++    UNIFIED_SOURCES += [
++        "ppc64/Architecture-ppc64.cpp",
++        "ppc64/Assembler-ppc64.cpp",
++        "ppc64/CodeGenerator-ppc64.cpp",
++        "ppc64/Lowering-ppc64.cpp",
++        "ppc64/MacroAssembler-ppc64.cpp",
++        "ppc64/MoveEmitter-ppc64.cpp",
++        "ppc64/Trampoline-ppc64.cpp",
++    ]
++    if CONFIG["JS_SIMULATOR_PPC64"]:
++        UNIFIED_SOURCES += ["ppc64/Simulator-ppc64.cpp"]
+ elif CONFIG["JS_CODEGEN_RISCV64"]:
+     UNIFIED_SOURCES += [
+         "riscv64/Architecture-riscv64.cpp",
+diff --git a/js/src/jit/ppc64/Architecture-ppc64.cpp b/js/src/jit/ppc64/Architecture-ppc64.cpp
+new file mode 100644
+index 000000000000..5632865556ac
+--- /dev/null
++++ b/js/src/jit/ppc64/Architecture-ppc64.cpp
+@@ -0,0 +1,221 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/ppc64/Architecture-ppc64.h"
++
++#ifndef JS_SIMULATOR
++#  include <sys/auxv.h>
++#endif
++
++#include "jit/FlushICache.h"  // js::jit::FlushICache
++#include "jit/RegisterSets.h"
++
++namespace js {
++namespace jit {
++
++Registers::Code Registers::FromName(const char* name) {
++  for (size_t i = 0; i < Total; i++) {
++    if (strcmp(GetName(i), name) == 0) {
++      return Code(i);
++    }
++  }
++
++  return Invalid;
++}
++
++FloatRegisters::Code FloatRegisters::FromName(const char* name) {
++  for (size_t i = 0; i < Total; i++) {
++    if (strcmp(GetName(i), name) == 0) {
++      return Code(i);
++    }
++  }
++
++  return Invalid;
++}
++
++FloatRegisterSet FloatRegister::ReduceSetForPush(const FloatRegisterSet& s) {
++  SetType all = s.bits();
++  SetType simd128Set =
++      (all >> (uint32_t(FloatRegisters::Simd128) * FloatRegisters::TotalPhys)) &
++      FloatRegisters::AllPhysMask;
++  SetType doubleSet =
++      (all >> (uint32_t(FloatRegisters::Double) * FloatRegisters::TotalPhys)) &
++      FloatRegisters::AllPhysMask;
++  SetType singleSet =
++      (all >> (uint32_t(FloatRegisters::Single) * FloatRegisters::TotalPhys)) &
++      FloatRegisters::AllPhysMask;
++
++  // Single+Double share physical FPRs (push as Double, 8-byte slot);
++  // Simd128 lives in its own physical VRs (push as Simd128, 16-byte
++  // slot). Different physical pools — no dedup. Note that
++  // sizeof(FloatRegisters::RegisterContent) is 8 bytes (no v128 in the
++  // union), so RegisterDump::FPUArray is 32 × 8 = 256 bytes, matching
++  // the Float-only layout PushRegsInMask produces.
++  SetType set64 = singleSet | doubleSet;
++
++  SetType reduced =
++      (simd128Set << (uint32_t(FloatRegisters::Simd128) *
++                      FloatRegisters::TotalPhys)) |
++      (set64 << (uint32_t(FloatRegisters::Double) * FloatRegisters::TotalPhys));
++  return FloatRegisterSet(reduced);
++}
++
++uint32_t FloatRegister::GetPushSizeInBytes(const FloatRegisterSet& s) {
++  SetType all = s.bits();
++  SetType simd128Set =
++      (all >> (uint32_t(FloatRegisters::Simd128) * FloatRegisters::TotalPhys)) &
++      FloatRegisters::AllPhysMask;
++  SetType doubleSet =
++      (all >> (uint32_t(FloatRegisters::Double) * FloatRegisters::TotalPhys)) &
++      FloatRegisters::AllPhysMask;
++  SetType singleSet =
++      (all >> (uint32_t(FloatRegisters::Single) * FloatRegisters::TotalPhys)) &
++      FloatRegisters::AllPhysMask;
++
++  // Natural per-kind slot sizes. See ReduceSetForPush comment.
++  SetType set64 = singleSet | doubleSet;
++
++  uint32_t count64 = std::popcount(static_cast<uint64_t>(set64));
++  uint32_t count128 = std::popcount(static_cast<uint64_t>(simd128Set));
++
++  return count64 * sizeof(double) + count128 * 16;
++}
++
++uint32_t FloatRegister::getRegisterDumpOffsetInBytes() {
++  // Simd128 encoding is 32-63 — mask back to 0-31 for the FPUArray-
++  // relative offset. (FPUArray has 32 slots; Simd128 should never be in
++  // a SafepointState/BailoutState anyway.)
++  return (encoding() & 31) * sizeof(FloatRegisters::RegisterContent);
++}
++
++static bool sPOWER9Detected = false;
++static bool sPOWER10Detected = false;
++static bool sCPUFlagsComputed = false;
++
++#ifndef JS_SIMULATOR
++// Cache line sizes, detected at startup from ELF auxiliary vector.
++// Fallback to 32 bytes (safe minimum per LuaJIT/LLVM compiler-rt).
++static size_t sDCacheLineSize = 0;
++static size_t sICacheLineSize = 0;
++#endif
++
++void PPC64Flags::Init() {
++  if (sCPUFlagsComputed) {
++    return;
++  }
++#ifndef JS_SIMULATOR
++  unsigned long hwcap2 = getauxval(AT_HWCAP2);
++  // PPC_FEATURE2_ARCH_3_00 = 0x00800000 (ISA 3.0 / POWER9)
++  sPOWER9Detected = (hwcap2 & 0x00800000) != 0;
++  // PPC_FEATURE2_ARCH_3_1 = 0x00040000 (ISA 3.1 / POWER10)
++  sPOWER10Detected = (hwcap2 & 0x00040000) != 0;
++  // Allow forcing POWER8 mode for testing: MOZ_PPC64_FORCE_POWER8=1.
++  // P10 implies P9; downgrade clears both.
++  const char* forceP8 = getenv("MOZ_PPC64_FORCE_POWER8");
++  if (forceP8 && forceP8[0] == '1') {
++    sPOWER9Detected = false;
++    sPOWER10Detected = false;
++  }
++
++  size_t dcache = getauxval(AT_DCACHEBSIZE);
++  size_t icache = getauxval(AT_ICACHEBSIZE);
++  sDCacheLineSize = dcache ? dcache : 32;
++  sICacheLineSize = icache ? icache : 32;
++#endif
++  // FORCE_POWER9/10 opt into the corresponding ISA fast paths. Useful under
++  // the simulator; on real silicon below the gated level they are foot-guns
++  // because the CPU will trap on undefined ops. Outside the JS_SIMULATOR
++  // guard so the sim can opt in via env.
++  //
++  // FORCE_POWER10 also implies FORCE_POWER9 — this matches what real-P10
++  // silicon advertises in hwcap2 (both ARCH_3_00 and ARCH_3_1 bits set), so
++  // we don't ask sim users to pass both vars separately.
++  const char* forceP9 = getenv("MOZ_PPC64_FORCE_POWER9");
++  if (forceP9 && forceP9[0] == '1') {
++    sPOWER9Detected = true;
++  }
++  const char* forceP10 = getenv("MOZ_PPC64_FORCE_POWER10");
++  if (forceP10 && forceP10[0] == '1') {
++    sPOWER10Detected = true;
++    sPOWER9Detected = true;
++  }
++  sCPUFlagsComputed = true;
++}
++
++bool HasPOWER9() {
++  MOZ_ASSERT(sCPUFlagsComputed);
++  return sPOWER9Detected;
++}
++
++bool HasPOWER10() {
++  MOZ_ASSERT(sCPUFlagsComputed);
++  return sPOWER10Detected;
++}
++
++bool CPUFlagsHaveBeenComputed() { return sCPUFlagsComputed; }
++
++// Per-bit feature flags packed into the wasm code signature. Adding a
++// new bit (e.g., POWER10, VSX4) should be a 1-line change here plus a
++// corresponding HasPOWER10()/IsVSX4Available() probe above. The value
++// is also assert-checked into a fixed-width field in
++// js/src/wasm/WasmCompile.cpp — if that field ever overflows, widen
++// it there before landing more bits here.
++uint32_t GetPPC64Flags() {
++  uint32_t flags = 0;
++  if (sPOWER9Detected) {
++    flags |= PPC64Flag_POWER9;
++  }
++  return flags;
++}
++
++void FlushICache(void* code, size_t size) {
++#if defined(JS_SIMULATOR)
++  js::jit::SimulatorProcess::FlushICache(code, size);
++#else
++  // PPC64 has incoherent I/D caches. GCC's __builtin___clear_cache is a
++  // no-op on PPC64 Linux, so we implement the flush explicitly.
++  // This follows the same approach as QEMU (util/cacheflush.c) and the
++  // Linux kernel (arch/powerpc/mm/cacheflush.c):
++  //   dcbst loop -> sync -> icbi loop -> sync -> isync
++  if (!size) {
++    return;
++  }
++  MOZ_ASSERT(sCPUFlagsComputed,
++             "PPC64Flags::Init must run before any FlushICache call");
++
++  uintptr_t start = reinterpret_cast<uintptr_t>(code);
++  uintptr_t end = start + size;
++
++  // Step 1: Write back data cache to memory.
++  for (uintptr_t addr = start & ~(sDCacheLineSize - 1); addr < end;
++       addr += sDCacheLineSize) {
++    asm volatile("dcbst 0, %0" : : "r"(addr) : "memory");
++  }
++  asm volatile("sync" ::: "memory");
++
++  // Step 2: Invalidate instruction cache.
++  for (uintptr_t addr = start & ~(sICacheLineSize - 1); addr < end;
++       addr += sICacheLineSize) {
++    asm volatile("icbi 0, %0" : : "r"(addr) : "memory");
++  }
++  // The extra sync before isync matches the Linux kernel and QEMU.
++  // It ensures all icbi operations complete before the pipeline flush.
++  asm volatile("sync" ::: "memory");
++  asm volatile("isync" ::: "memory");
++#endif
++}
++
++void FlushExecutionContext() {
++#if !defined(JS_SIMULATOR)
++  // PPC64's isync flushes the instruction pipeline on the current core,
++  // ensuring any previously invalidated icache entries are discarded and
++  // instructions are re-fetched from coherent memory.
++  asm volatile("isync" ::: "memory");
++#endif
++}
++
++}  // namespace jit
++}  // namespace js
+diff --git a/js/src/jit/ppc64/Architecture-ppc64.h b/js/src/jit/ppc64/Architecture-ppc64.h
+new file mode 100644
+index 000000000000..efaab0b0c854
+--- /dev/null
++++ b/js/src/jit/ppc64/Architecture-ppc64.h
+@@ -0,0 +1,581 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_Architecture_ppc64_h
++#define jit_ppc64_Architecture_ppc64_h
++
++#include <algorithm>
++#include <bit>
++
++#include "jit/shared/Architecture-shared.h"
++
++#include "js/Utility.h"
++
++namespace js {
++namespace jit {
++
++// PPC64 has 32 64-bit general purpose registers, r0 through r31.
++// The program counter is not directly accessible as a register.
++// The link register (LR) and count register (CTR) are SPRs.
++
++// PPC64 ELFv2 GPR Convention:
++//  Name    Usage
++//  r0      Volatile, cannot be base register in load/store
++//  r1      Stack pointer (callee-saved)
++//  r2      TOC pointer (reserved)
++//  r3      Return value / first argument
++//  r4-r10  Arguments 2-8
++//  r11     Environment pointer / scratch
++//  r12     Branch target / scratch
++//  r13     Thread pointer (reserved, TLS)
++//  r14-r31 Callee-saved
++
++// PPC64 ELFv2 FPR Convention:
++//  f0      Scratch
++//  f1-f13  Arguments / volatile
++//  f14-f31 Callee-saved
++
++class Registers {
++ public:
++  enum RegisterID {
++    r0 = 0,
++    r1,
++    r2,
++    r3,
++    r4,
++    r5,
++    r6,
++    r7,
++    r8,
++    r9,
++    r10,
++    r11,
++    r12,
++    r13,
++    r14,
++    r15,
++    r16,
++    r17,
++    r18,
++    r19,
++    r20,
++    r21,
++    r22,
++    r23,
++    r24,
++    r25,
++    r26,
++    r27,
++    r28,
++    r29,
++    r30,
++    r31,
++    sp = r1,
++    invalid_reg,
++  };
++  typedef uint8_t Code;
++  typedef RegisterID Encoding;
++  typedef uint32_t SetType;
++
++  static const Encoding StackPointer = sp;
++  static const Encoding Invalid = invalid_reg;
++
++  union RegisterContent {
++    uintptr_t r;
++  };
++
++  static uint32_t SetSize(SetType x) { return std::popcount(x); }
++  static uint32_t FirstBit(SetType x) {
++    MOZ_ASSERT(x);
++    return std::countr_zero(x);
++  }
++  static uint32_t LastBit(SetType x) {
++    MOZ_ASSERT(x);
++    return std::bit_width(x) - 1;
++  }
++
++  static const char* GetName(uint32_t code) {
++    static const char* const Names[] = {
++        "r0",  "sp",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
++        "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
++        "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
++        "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31"};
++    static_assert(Total == std::size(Names), "Table is the correct size");
++    if (code >= Total) {
++      return "invalid";
++    }
++    return Names[code];
++  }
++
++  static Code FromName(const char* name);
++
++  static const uint32_t Total = 32;
++  static const uint32_t TotalPhys = 32;
++  static const uint32_t Allocatable = 24;
++
++  static const SetType AllMask = 0xFFFFFFFF;
++  static const SetType NoneMask = 0x0;
++
++  static const SetType ArgRegMask =
++      (1U << Registers::r3) | (1U << Registers::r4) | (1U << Registers::r5) |
++      (1U << Registers::r6) | (1U << Registers::r7) | (1U << Registers::r8) |
++      (1U << Registers::r9) | (1U << Registers::r10);
++
++  // r0, r11, r12 are also volatile but handled separately.
++  static const SetType VolatileMask = ArgRegMask;
++
++  // ELFv2 callee-saved GPRs are r14..r31. r2 (TOC) and r13 (TLS) are
++  // dedicated registers, NOT general callee-saved: r2 is restored by the
++  // PLT-call linkage convention (`ld r2, 24(r1)` after every cross-module
++  // call); r13 is the thread pointer and must NEVER be written. Including
++  // them here previously made `PushRegsInMask(NonVolatileMask)` save and
++  // restore them — wasted 16 bytes per wasm-stub frame at best, latent
++  // TLS corruption if save/restore were ever misordered. Verified that
++  // no JIT-emitted code writes r2 or r13 (both are NonAllocatable, and
++  // grep across js/src/jit/ppc64/ finds no `as_*` site assigning to
++  // them), so they're preserved across the JIT body for free.
++  static const SetType NonVolatileMask =
++      (1U << Registers::r14) |
++      (1U << Registers::r15) | (1U << Registers::r16) | (1U << Registers::r17) |
++      (1U << Registers::r18) | (1U << Registers::r19) | (1U << Registers::r20) |
++      (1U << Registers::r21) | (1U << Registers::r22) | (1U << Registers::r23) |
++      (1U << Registers::r24) | (1U << Registers::r25) | (1U << Registers::r26) |
++      (1U << Registers::r27) | (1U << Registers::r28) | (1U << Registers::r29) |
++      (1U << Registers::r30) | (1U << Registers::r31);
++
++  static const SetType NonAllocatableMask =
++      (1U << Registers::r0) |   // Cannot be base in load/store.
++      (1U << Registers::sp) |   // Stack pointer.
++      (1U << Registers::r2) |   // TOC pointer (ELFv2).
++      (1U << Registers::r11) |  // Third scratch.
++      (1U << Registers::r12) |  // Second scratch / addressTempRegister.
++      (1U << Registers::r13) |  // Thread-local storage (ELFv2).
++      (1U << Registers::r16) |  // Saved scratch register.
++      (1U << Registers::r31);   // Frame pointer.
++
++  static const SetType WrapperMask = VolatileMask;
++
++  // Registers returned from a JS -> JS call.
++  static const SetType JSCallMask = (1U << Registers::r5);
++
++  // Registers returned from a JS -> C call.
++  static const SetType CallMask = (1U << Registers::r3);
++
++  static const SetType AllocatableMask = AllMask & ~NonAllocatableMask;
++};
++
++typedef uint32_t PackedRegisterMask;
++
++template <typename T>
++class TypedRegisterSet;
++
++class FloatRegisters {
++ public:
++  enum FPRegisterID {
++    f0 = 0,
++    f1,
++    f2,
++    f3,
++    f4,
++    f5,
++    f6,
++    f7,
++    f8,
++    f9,
++    f10,
++    f11,
++    f12,
++    f13,
++    f14,
++    f15,
++    f16,
++    f17,
++    f18,
++    f19,
++    f20,
++    f21,
++    f22,
++    f23,
++    f24,
++    f25,
++    f26,
++    f27,
++    f28,
++    f29,
++    f30,
++    f31,
++  };
++
++  // Eight bits: (invalid << 7) | (kind << 5) | encoding
++  typedef uint8_t Code;
++  typedef FPRegisterID Encoding;
++  // 3 kinds × 32 regs = 96 bits needed. Use __uint128_t.
++  typedef __uint128_t SetType;
++
++  enum Kind : uint8_t { Double, Single, Simd128, NumTypes };
++
++  static constexpr Code Invalid = 0x80;
++
++  static const char* GetName(uint32_t code) {
++    static const char* const Names[] = {
++        "f0",  "f1",  "f2",  "f3",  "f4",  "f5",  "f6",  "f7",
++        "f8",  "f9",  "f10", "f11", "f12", "f13", "f14", "f15",
++        "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
++        "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31"};
++    static_assert(TotalPhys == std::size(Names), "Table is the correct size");
++    if (code >= Total) {
++      return "invalid";
++    }
++    return Names[code % TotalPhys];
++  }
++
++  static Code FromName(const char* name);
++
++  static const uint32_t TotalPhys = 32;
++  static const uint32_t Total = TotalPhys * NumTypes;
++  static const uint32_t Allocatable = 31;  // Without f0, the scratch register.
++
++  static_assert(sizeof(SetType) * 8 >= Total,
++                "SetType should be large enough to enumerate all registers.");
++
++  static const SetType SpreadSingle = SetType(1)
++                                      << (uint32_t(Single) * TotalPhys);
++  static const SetType SpreadDouble = SetType(1)
++                                      << (uint32_t(Double) * TotalPhys);
++  static const SetType SpreadSimd128 = SetType(1)
++                                       << (uint32_t(Simd128) * TotalPhys);
++  static const SetType Spread = SpreadSingle | SpreadDouble | SpreadSimd128;
++
++  static const SetType AllPhysMask = ((SetType(1) << TotalPhys) - 1);
++  static const SetType AllMask = AllPhysMask * Spread;
++  static const SetType AllSingleMask = AllPhysMask * SpreadSingle;
++  static const SetType AllDoubleMask = AllPhysMask * SpreadDouble;
++  static const SetType AllSimd128Mask = AllPhysMask * SpreadSimd128;
++  static const SetType NoneMask = SetType(0);
++
++  // ELFv2: f14-f31 are non-volatile (callee-saved) for scalar FP.
++  // The upper 64 bits of VSR 0-31 are volatile, so Simd128 view is all-volatile.
++  static const SetType NonVolatilePhysMask =
++      SetType((1U << FloatRegisters::f14) | (1U << FloatRegisters::f15) |
++              (1U << FloatRegisters::f16) | (1U << FloatRegisters::f17) |
++              (1U << FloatRegisters::f18) | (1U << FloatRegisters::f19) |
++              (1U << FloatRegisters::f20) | (1U << FloatRegisters::f21) |
++              (1U << FloatRegisters::f22) | (1U << FloatRegisters::f23) |
++              (1U << FloatRegisters::f24) | (1U << FloatRegisters::f25) |
++              (1U << FloatRegisters::f26) | (1U << FloatRegisters::f27) |
++              (1U << FloatRegisters::f28) | (1U << FloatRegisters::f29) |
++              (1U << FloatRegisters::f30) | (1U << FloatRegisters::f31));
++  // Simd128 lives in VR-namespace (VSR32-63 = VR0-VR31). Per ELFv2 ABI,
++  // VR20-VR31 are non-volatile (callee-saved). Encoding storage is 20-31
++  // with kind=Simd128.
++  static const SetType SimdNonVolatilePhysMask =
++      SetType((1U << 20) | (1U << 21) | (1U << 22) | (1U << 23) |
++              (1U << 24) | (1U << 25) | (1U << 26) | (1U << 27) |
++              (1U << 28) | (1U << 29) | (1U << 30) | (1U << 31));
++  static const SetType NonVolatileMask =
++      NonVolatilePhysMask * (SpreadSingle | SpreadDouble) |
++      SimdNonVolatilePhysMask * SpreadSimd128;
++
++  static const SetType VolatileMask = AllMask & ~NonVolatileMask;
++
++  static const SetType WrapperMask = VolatileMask;
++
++  // f0 is the scratch register (all three views: single, double, simd128).
++  static const SetType NonAllocatableMask =
++      (SetType(1) << FloatRegisters::f0) * Spread;
++
++  static const SetType AllocatableMask = AllMask & ~NonAllocatableMask;
++
++  union RegisterContent {
++    float s;
++    double d;
++    // No v128 here. Simd128 lives in physically-distinct VRs (VSR32-63)
++    // and never reaches RegisterDump (asserted by SafepointState; bailout
++    // AllRegs excludes Simd128). With v128 in the union, sizeof was 16,
++    // forcing PushRegsInMask to a 16-byte stride that mismatched
++    // addressOfRegister's 8-byte walk via (*iter).size().
++  };
++
++  static constexpr Encoding encoding(Code c) { return Encoding(c & 31); }
++
++  static constexpr Kind kind(Code c) { return Kind((c >> 5) & 3); }
++
++  static constexpr Code fromParts(uint32_t encoding, uint32_t kind,
++                                  uint32_t invalid) {
++    return Code((invalid << 7) | (kind << 5) | encoding);
++  }
++};
++
++// SpillSlotSize must fit the widest register class (Simd128 = 16 bytes).
++// We can't derive from sizeof(FloatRegisters::RegisterContent) — that
++// union is sized for FPRs only (8 bytes since v128 lives in distinct
++// VRs, not in the FPR union), so deriving would under-reserve for
++// Simd128 cycle breaks. SpillSlotSize is consumed only by MoveEmitter
++// and is not part of the JIT frame layout.
++static const uint32_t SpillSlotSize = 16;
++
++// PPC64 ELFv2 ABI: the callee saves LR at [caller_SP+16], CR at
++// [caller_SP+8], and may save TOC at [caller_SP+24]. Reserve 32 bytes
++// (the minimum ELFv2 stack frame) as a shadow area for every ABI call.
++static constexpr uint32_t ShadowStackSpace = 32;
++static const uint32_t SizeOfReturnAddressAfterCall = 0;
++
++// PPC64 branch instructions have a 26-bit signed offset field, giving a
++// range of +/- 32MB. We reduce this to leave room for jump island insertion.
++static constexpr uint32_t JumpImmediateRange = (32 * 1024 * 1024) - 32;
++
++// Size of each bailout table entry (a single bl instruction).
++static const uint32_t BAILOUT_TABLE_ENTRY_SIZE = 4;
++
++// PPC64 special purpose registers (not exposed to the allocator).
++enum SPRegisterID {
++  spr_xer = 1,
++  spr_lr = 8,
++  spr_ctr = 9,
++  spr_vrsave = 256,
++  invalid_spreg
++};
++
++// PPC64 condition registers.
++enum CRegisterID { cr0 = 0, cr1, cr5 = 5, cr6, cr7, invalid_creg };
++
++struct FloatRegister {
++  typedef FloatRegisters Codes;
++  typedef size_t Code;
++  typedef Codes::Encoding Encoding;
++  typedef Codes::SetType SetType;
++
++  static uint32_t SetSize(SetType x) {
++    // Fold all 3 kinds (Single, Double, Simd128) down to physical mask.
++    SetType phys = (x & FloatRegisters::AllPhysMask) |
++                   ((x >> FloatRegisters::TotalPhys) & FloatRegisters::AllPhysMask) |
++                   ((x >> (2 * FloatRegisters::TotalPhys)) & FloatRegisters::AllPhysMask);
++    return std::popcount(static_cast<uint64_t>(phys));
++  }
++
++  // __uint128_t helpers for FirstBit/LastBit.
++  static uint32_t FirstBit(SetType x) {
++    MOZ_ASSERT(x);
++    uint64_t lo = static_cast<uint64_t>(x);
++    if (lo) {
++      return std::countr_zero(lo);
++    }
++    return 64 + std::countr_zero(static_cast<uint64_t>(x >> 64));
++  }
++  static uint32_t LastBit(SetType x) {
++    MOZ_ASSERT(x);
++    uint64_t hi = static_cast<uint64_t>(x >> 64);
++    if (hi) {
++      return 64 + (std::bit_width(hi) - 1);
++    }
++    return std::bit_width(static_cast<uint64_t>(x)) - 1;
++  }
++
++ private:
++  uint8_t encoding_;
++  uint8_t kind_;
++  bool invalid_;
++
++  typedef Codes::Kind Kind;
++
++ public:
++  constexpr FloatRegister(Encoding encoding, Kind kind)
++      : encoding_(encoding), kind_(kind), invalid_(false) {}
++
++  constexpr FloatRegister()
++      : encoding_(0), kind_(FloatRegisters::Double), invalid_(true) {}
++
++  static FloatRegister FromCode(uint32_t i) {
++    MOZ_ASSERT(i < Codes::Total);
++    return FloatRegister(FloatRegisters::encoding(i), FloatRegisters::kind(i));
++  }
++
++  bool isSingle() const {
++    MOZ_ASSERT(!invalid_);
++    return kind_ == FloatRegisters::Single;
++  }
++  bool isDouble() const {
++    MOZ_ASSERT(!invalid_);
++    return kind_ == FloatRegisters::Double;
++  }
++  bool isSimd128() const {
++    MOZ_ASSERT(!invalid_);
++    return kind_ == FloatRegisters::Simd128;
++  }
++  bool isInvalid() const { return invalid_; }
++
++  FloatRegister asSingle() const {
++    MOZ_ASSERT(!invalid_);
++    return FloatRegister(Encoding(encoding_), FloatRegisters::Single);
++  }
++  FloatRegister asDouble() const {
++    MOZ_ASSERT(!invalid_);
++    return FloatRegister(Encoding(encoding_), FloatRegisters::Double);
++  }
++  FloatRegister asSimd128() const {
++    MOZ_ASSERT(!invalid_);
++    return FloatRegister(Encoding(encoding_), FloatRegisters::Simd128);
++  }
++
++  constexpr uint32_t size() const {
++    MOZ_ASSERT(!invalid_);
++    if (kind_ == FloatRegisters::Double) {
++      return sizeof(double);
++    }
++    if (kind_ == FloatRegisters::Single) {
++      return sizeof(float);
++    }
++    MOZ_ASSERT(kind_ == FloatRegisters::Simd128);
++    return 16;
++  }
++
++  constexpr Code code() const {
++    return Codes::fromParts(encoding_, kind_, invalid_);
++  }
++
++  constexpr Encoding encoding() const {
++    MOZ_ASSERT(!invalid_);
++    // Simd128 lives in VR-namespace at VSR32-63 (= VR0-31). Single/Double
++    // share FPR namespace at VSR0-31. The unified XX-form encoders split
++    // the result into low-5-bit VRT/VRA/VRB + TX/AX/BX bits; VMX
++    // FloatRegister-taking encoders mask with `& 31` for the raw VR
++    // field. So 32+E flows correctly through both paths.
++    return Encoding(encoding_ +
++                    (kind_ == FloatRegisters::Simd128 ? 32 : 0));
++  }
++
++  const char* name() const { return FloatRegisters::GetName(code()); }
++  bool volatile_() const {
++    MOZ_ASSERT(!invalid_);
++    return !!((SetType(1) << code()) & FloatRegisters::VolatileMask);
++  }
++  constexpr bool operator!=(FloatRegister other) const {
++    return code() != other.code();
++  }
++  constexpr bool operator==(FloatRegister other) const {
++    return code() == other.code();
++  }
++
++  bool aliases(FloatRegister other) const {
++    // Register-class partition: {Single, Double} share FPRs (VSR0-31);
++    // Simd128 lives in VR-namespace (VSR32-63). FPR f5 (Single/Double
++    // encoding 5) and VR v5 (Simd128 encoding 5) are distinct physical
++    // registers.
++    if (encoding_ != other.encoding_) return false;
++    bool selfSimd = (kind_ == FloatRegisters::Simd128);
++    bool otherSimd = (other.kind_ == FloatRegisters::Simd128);
++    return selfSimd == otherSimd;
++  }
++  bool equiv(FloatRegister other) const {
++    MOZ_ASSERT(!invalid_);
++    return kind_ == other.kind_;
++  }
++
++  uint32_t numAliased() const {
++    return (kind_ == FloatRegisters::Simd128) ? 1 : 2;
++  }
++  uint32_t numAlignedAliased() { return numAliased(); }
++
++  FloatRegister aliased(uint32_t aliasIdx) {
++    MOZ_ASSERT(!invalid_);
++    MOZ_ASSERT(aliasIdx < numAliased());
++    if (kind_ == FloatRegisters::Simd128) {
++      return *this;
++    }
++    Kind otherKind = (kind_ == FloatRegisters::Single)
++                         ? FloatRegisters::Double
++                         : FloatRegisters::Single;
++    Kind selectedKind = (aliasIdx == 0) ? Kind(kind_) : otherKind;
++    return FloatRegister(Encoding(encoding_), selectedKind);
++  }
++  FloatRegister alignedAliased(uint32_t aliasIdx) {
++    MOZ_ASSERT(aliasIdx < numAliased());
++    return aliased(aliasIdx);
++  }
++  SetType alignedOrDominatedAliasedSet() const {
++    if (kind_ == FloatRegisters::Simd128) {
++      return SetType(1) << ((uint32_t(FloatRegisters::Simd128) *
++                             FloatRegisters::TotalPhys) +
++                            encoding_);
++    }
++    return (Codes::SpreadSingle | Codes::SpreadDouble) << encoding_;
++  }
++
++  static constexpr RegTypeName DefaultType = RegTypeName::Float64;
++
++  template <RegTypeName Name = DefaultType>
++  static SetType LiveAsIndexableSet(SetType s) {
++    return SetType(0);
++  }
++
++  template <RegTypeName Name = DefaultType>
++  static SetType AllocatableAsIndexableSet(SetType s) {
++    static_assert(Name != RegTypeName::Any, "Allocatable set are not iterable");
++    return LiveAsIndexableSet<Name>(s);
++  }
++
++  static TypedRegisterSet<FloatRegister> ReduceSetForPush(
++      const TypedRegisterSet<FloatRegister>& s);
++  static uint32_t GetPushSizeInBytes(const TypedRegisterSet<FloatRegister>& s);
++  uint32_t getRegisterDumpOffsetInBytes();
++};
++
++template <>
++inline FloatRegister::SetType
++FloatRegister::LiveAsIndexableSet<RegTypeName::Float32>(SetType set) {
++  return set & FloatRegisters::AllSingleMask;
++}
++
++template <>
++inline FloatRegister::SetType
++FloatRegister::LiveAsIndexableSet<RegTypeName::Float64>(SetType set) {
++  return set & FloatRegisters::AllDoubleMask;
++}
++
++template <>
++inline FloatRegister::SetType
++FloatRegister::LiveAsIndexableSet<RegTypeName::Vector128>(SetType set) {
++  return set & FloatRegisters::AllSimd128Mask;
++}
++
++template <>
++inline FloatRegister::SetType
++FloatRegister::LiveAsIndexableSet<RegTypeName::Any>(SetType set) {
++  return set;
++}
++
++inline bool hasUnaliasedDouble() { return false; }
++inline bool hasMultiAlias() { return false; }
++
++// PPC64 feature bits packed into the value GetPPC64Flags() returns,
++// which feeds wasm/WasmCompile.cpp's per-architecture code signature.
++// Defined as enum constants (not enum class) so callers can OR/AND
++// freely. New bits should remain backward-compatible — older signatures
++// must keep meaning the same set of features.
++enum PPC64FeatureFlags : uint32_t {
++  PPC64Flag_POWER9 = 1u << 0,
++  // Future: PPC64Flag_POWER10 = 1u << 1, PPC64Flag_VSX4 = 1u << 2, ...
++};
++
++uint32_t GetPPC64Flags();
++
++class PPC64Flags final {
++ public:
++  PPC64Flags() = delete;
++
++  // PPC64Flags::Init is called from the JitContext constructor to read the
++  // hardware capabilities (via getauxval(AT_HWCAP2)). It must be called
++  // exactly once, before HasPOWER9()/HasPOWER10() are used.
++  static void Init();
++};
++
++bool HasPOWER9();
++bool HasPOWER10();
++
++}  // namespace jit
++}  // namespace js
++
++#endif /* jit_ppc64_Architecture_ppc64_h */
+diff --git a/js/src/jit/ppc64/Assembler-ppc64.cpp b/js/src/jit/ppc64/Assembler-ppc64.cpp
+new file mode 100644
+index 000000000000..481070c4c6d5
+--- /dev/null
++++ b/js/src/jit/ppc64/Assembler-ppc64.cpp
+@@ -0,0 +1,3028 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/ppc64/Assembler-ppc64.h"
++
++#include "mozilla/DebugOnly.h"
++#include "mozilla/Maybe.h"
++
++#include "gc/Marking.h"
++#include "jit/AutoWritableJitCode.h"
++#include "jit/ExecutableAllocator.h"
++#include "jit/FlushICache.h"
++
++using mozilla::DebugOnly;
++
++using namespace js;
++using namespace js::jit;
++
++// ELFv2 ABI: 8 GPRs (r3-r10), 13 FPRs (f1-f13).
++// FP arguments also consume a GPR slot per ELFv2 convention.
++ABIArg ABIArgGenerator::next(MIRType type) {
++  switch (type) {
++    case MIRType::Int32:
++    case MIRType::Int64:
++    case MIRType::Pointer:
++    case MIRType::WasmAnyRef:
++    case MIRType::WasmArrayData:
++    case MIRType::StackResults: {
++      if (intRegIndex_ >= NumIntArgRegs) {
++        current_ = ABIArg(stackOffset_);
++        stackOffset_ += sizeof(uintptr_t);
++        break;
++      }
++      current_ = ABIArg(Register::FromCode(Registers::r3 + intRegIndex_));
++      intRegIndex_++;
++      break;
++    }
++    case MIRType::Float32:
++    case MIRType::Double: {
++      if (floatRegIndex_ == NumFloatArgRegs) {
++        current_ = ABIArg(stackOffset_);
++        stackOffset_ += sizeof(double);
++        break;
++      }
++      current_ = ABIArg(FloatRegister(
++          FloatRegisters::Encoding(FloatRegisters::f1 + floatRegIndex_),
++          type == MIRType::Double ? FloatRegisters::Double
++                                  : FloatRegisters::Single));
++      floatRegIndex_++;
++      // ELFv2 ABI: each FP arg also consumes a GPR slot (shadow).
++      // Cap at NumIntArgRegs so subsequent int args go to the stack.
++      if (intRegIndex_ < NumIntArgRegs) {
++        intRegIndex_++;
++      }
++      break;
++    }
++    case MIRType::Simd128: {
++      // Pass v128 in FP registers (Simd128 kind). On PPC64 ELFv2, SIMD
++      // values use the same VSR register file as FP args.
++      if (floatRegIndex_ == NumFloatArgRegs) {
++        current_ = ABIArg(stackOffset_);
++        stackOffset_ += 16;
++        break;
++      }
++      current_ = ABIArg(FloatRegister(
++          FloatRegisters::Encoding(FloatRegisters::f1 + floatRegIndex_),
++          FloatRegisters::Simd128));
++      floatRegIndex_++;
++      if (intRegIndex_ < NumIntArgRegs) {
++        intRegIndex_++;
++      }
++      break;
++    }
++    default:
++      MOZ_CRASH("Unexpected argument type");
++  }
++  return current_;
++}
++
++// Condition inversion tables.
++Assembler::Condition Assembler::InvertCondition(Condition cond) {
++  switch (cond) {
++    case Equal:
++      return NotEqual;
++    case NotEqual:
++      return Equal;
++    case LessThan:
++      return GreaterThanOrEqual;
++    case LessThanOrEqual:
++      return GreaterThan;
++    case GreaterThan:
++      return LessThanOrEqual;
++    case GreaterThanOrEqual:
++      return LessThan;
++    case Above:
++      return BelowOrEqual;
++    case AboveOrEqual:
++      return Below;
++    case Below:
++      return AboveOrEqual;
++    case BelowOrEqual:
++      return Above;
++    case Zero:
++      return NonZero;
++    case NonZero:
++      return Zero;
++    case Signed:
++      return NotSigned;
++    case NotSigned:
++      return Signed;
++    case SOBit:
++      return NSOBit;
++    case NSOBit:
++      return SOBit;
++    case Overflow:
++      return NotOverflow;
++    case NotOverflow:
++      return Overflow;
++    case CarrySet:
++      return CarryClear;
++    case CarryClear:
++      return CarrySet;
++    default:
++      MOZ_CRASH("unexpected condition");
++  }
++}
++
++Assembler::DoubleCondition Assembler::InvertCondition(DoubleCondition cond) {
++  switch (cond) {
++    case DoubleOrdered:
++      return DoubleUnordered;
++    case DoubleEqual:
++      return DoubleNotEqualOrUnordered;
++    case DoubleNotEqual:
++      return DoubleEqualOrUnordered;
++    case DoubleGreaterThan:
++      return DoubleLessThanOrEqualOrUnordered;
++    case DoubleGreaterThanOrEqual:
++      return DoubleLessThanOrUnordered;
++    case DoubleLessThan:
++      return DoubleGreaterThanOrEqualOrUnordered;
++    case DoubleLessThanOrEqual:
++      return DoubleGreaterThanOrUnordered;
++    case DoubleUnordered:
++      return DoubleOrdered;
++    case DoubleEqualOrUnordered:
++      return DoubleNotEqual;
++    case DoubleNotEqualOrUnordered:
++      return DoubleEqual;
++    case DoubleGreaterThanOrUnordered:
++      return DoubleLessThanOrEqual;
++    case DoubleGreaterThanOrEqualOrUnordered:
++      return DoubleLessThan;
++    case DoubleLessThanOrUnordered:
++      return DoubleGreaterThanOrEqual;
++    case DoubleLessThanOrEqualOrUnordered:
++      return DoubleGreaterThan;
++    default:
++      MOZ_CRASH("unexpected condition");
++  }
++}
++
++// InstImm helper.
++uint8_t InstImm::traptag() {
++  uint8_t r = ((data & 0x001f0000) >> 16);
++  MOZ_ASSERT(isOpcode(PPC_tw));
++  MOZ_ASSERT(r == ((data & 0x0000f800) >> 11));
++  return r & 0xfe;
++}
++
++BOffImm16::BOffImm16(InstImm inst) : data(inst.extractImm16Value() & 0xFFFC) {
++  // Sign-extend the 16-bit field.
++  if (data & 0x8000) {
++    data |= ~0xFFFF;
++  }
++}
++
++Instruction* BOffImm16::getDest(Instruction* src) const {
++  return (Instruction*)((uint8_t*)src + data);
++}
++
++Instruction* JOffImm26::getDest(Instruction* src) const {
++  return (Instruction*)((uint8_t*)src + data);
++}
++
++Imm16::Imm16() : value(0) {}
++
++Imm8::Imm8() : value(0) {}
++
++// Buffer management.
++bool Assembler::oom() const {
++  return AssemblerShared::oom() || m_buffer.oom() || jumpRelocations_.oom() ||
++         dataRelocations_.oom();
++}
++
++void Assembler::finish() {
++  MOZ_ASSERT(!isFinished);
++  isFinished = true;
++  m_buffer.flushPool();
++}
++
++bool Assembler::appendRawCode(const uint8_t* code, size_t numBytes) {
++  return m_buffer.appendRawCode(code, numBytes);
++}
++
++bool Assembler::reserve(size_t size) {
++  // Fixed-size chunk buffer; no point in reserving now vs. on-demand.
++  return !oom();
++}
++
++bool Assembler::swapBuffer(wasm::Bytes& bytes) {
++  MOZ_ASSERT(bytes.empty());
++  if (!bytes.resize(bytesNeeded())) {
++    return false;
++  }
++  m_buffer.executableCopy(bytes.begin());
++  return true;
++}
++
++void Assembler::copyJumpRelocationTable(uint8_t* dest) {
++  if (jumpRelocations_.length()) {
++    memcpy(dest, jumpRelocations_.buffer(), jumpRelocations_.length());
++  }
++}
++
++void Assembler::copyDataRelocationTable(uint8_t* dest) {
++  if (dataRelocations_.length()) {
++    memcpy(dest, dataRelocations_.buffer(), dataRelocations_.length());
++  }
++}
++
++void Assembler::executableCopy(void* buffer) {
++  MOZ_ASSERT(isFinished);
++  m_buffer.executableCopy(static_cast<uint8_t*>(buffer));
++}
++
++void Assembler::executableCopy(uint8_t* buffer) {
++  MOZ_ASSERT(isFinished);
++  m_buffer.executableCopy(buffer);
++}
++
++size_t Assembler::size() const {
++  // AssemblerBufferWithConstantPools::size() asserts pool is empty.
++  // Flush pending pool entries first.
++  const_cast<PPCBufferWithExecutableCopy&>(m_buffer).flushPool();
++  return m_buffer.size();
++}
++
++size_t Assembler::jumpRelocationTableBytes() const {
++  return jumpRelocations_.length();
++}
++
++size_t Assembler::dataRelocationTableBytes() const {
++  return dataRelocations_.length();
++}
++
++size_t Assembler::bytesNeeded() const {
++  return size() + jumpRelocationTableBytes() + dataRelocationTableBytes();
++}
++
++// Write an instruction into the buffer or to an external destination.
++BufferOffset Assembler::writeInst(uint32_t x, uint32_t* dest) {
++  MOZ_ASSERT(hasCreator());
++  if (dest == nullptr) {
++    return m_buffer.putInt(x);
++  }
++
++  WriteInstStatic(x, dest);
++  return BufferOffset();
++}
++
++void Assembler::WriteInstStatic(uint32_t x, uint32_t* dest) {
++  MOZ_ASSERT(dest != nullptr);
++  *dest = x;
++}
++
++// Alignment.
++BufferOffset Assembler::haltingAlign(int alignment) {
++  BufferOffset ret;
++  MOZ_ASSERT(m_buffer.isAligned(4));
++  if (alignment == 8) {
++    if (!m_buffer.isAligned(alignment)) {
++      BufferOffset tmp = xs_trap();
++      if (!ret.assigned()) {
++        ret = tmp;
++      }
++    }
++  } else {
++    MOZ_ASSERT((alignment & (alignment - 1)) == 0);
++    while (size() & (alignment - 1)) {
++      BufferOffset tmp = xs_trap();
++      if (!ret.assigned()) {
++        ret = tmp;
++      }
++    }
++  }
++  return ret;
++}
++
++BufferOffset Assembler::nopAlign(int alignment) {
++  BufferOffset ret;
++  MOZ_ASSERT(m_buffer.isAligned(4));
++  if (alignment == 8) {
++    if (!m_buffer.isAligned(alignment)) {
++      BufferOffset tmp = as_nop();
++      if (!ret.assigned()) {
++        ret = tmp;
++      }
++    }
++  } else {
++    MOZ_ASSERT((alignment & (alignment - 1)) == 0);
++    while (size() & (alignment - 1)) {
++      BufferOffset tmp = as_nop();
++      if (!ret.assigned()) {
++        ret = tmp;
++      }
++    }
++  }
++  return ret;
++}
++
++// Primitive instructions.
++BufferOffset Assembler::as_nop() {
++  spew("nop");
++  return writeInst(PPC_nop);
++}
++
++BufferOffset Assembler::as_lwsync() {
++  spew("lwsync");
++  return writeInst(PPC_lwsync);
++}
++
++BufferOffset Assembler::as_sync() {
++  spew("sync");
++  return writeInst(PPC_sync);
++}
++
++BufferOffset Assembler::as_isync() {
++  spew("isync");
++  return writeInst(PPC_isync);
++}
++
++// Branch and jump instructions.
++BufferOffset Assembler::as_b(JOffImm26 off, BranchAddressType bat, LinkBit lb) {
++  return as_b(off.encode(), bat, lb);
++}
++
++BufferOffset Assembler::as_b(int32_t off, BranchAddressType bat, LinkBit lb) {
++  spew("b%s%s\t%x", bat == AbsoluteBranch ? "a" : "", lb ? "l" : "", off);
++  MOZ_ASSERT(!(off & 0x03));
++  return writeInst(PPC_b | ((uint32_t)off & 0x3fffffc) | bat | lb);
++}
++
++BufferOffset Assembler::as_blr(LinkBit lb) {
++  spew("blr%s", lb ? "l" : "");
++  return writeInst(uint32_t(PPC_blr) | uint32_t(lb));
++}
++
++BufferOffset Assembler::as_bctr(LinkBit lb) {
++  spew("bctr%s", lb ? "l" : "");
++  return writeInst(uint32_t(PPC_bctr) | uint32_t(lb));
++}
++
++// Conditional branches.
++BufferOffset Assembler::as_bc(BOffImm16 off, Condition cond, CRegisterID cr,
++                              LikelyBit lkb, LinkBit lb) {
++  return as_bc(off.encode(), cond, cr, lkb, lb);
++}
++
++BufferOffset Assembler::as_bc(int16_t off, Condition cond, CRegisterID cr,
++                              LikelyBit lkb, LinkBit lb) {
++  return as_bc(off, computeConditionCode(cond, cr), lkb, lb);
++}
++
++BufferOffset Assembler::as_bc(BOffImm16 off, DoubleCondition cond,
++                              CRegisterID cr, LikelyBit lkb, LinkBit lb) {
++  return as_bc(off.encode(), cond, cr, lkb, lb);
++}
++
++BufferOffset Assembler::as_bc(int16_t off, DoubleCondition cond, CRegisterID cr,
++                              LikelyBit lkb, LinkBit lb) {
++  return as_bc(off, computeConditionCode(cond, cr), lkb, lb);
++}
++
++BufferOffset Assembler::as_bcctr(Condition cond, CRegisterID cr, LikelyBit lkb,
++                                 LinkBit lb) {
++  return as_bcctr(computeConditionCode(cond, cr), lkb, lb);
++}
++
++BufferOffset Assembler::as_bcctr(DoubleCondition cond, CRegisterID cr,
++                                 LikelyBit lkb, LinkBit lb) {
++  return as_bcctr(computeConditionCode(cond, cr), lkb, lb);
++}
++
++// Condition code computation: turn DoubleCondition + CR into BO|BI.
++// May emit CR logic instructions for synthetic conditions involving FU bit.
++uint16_t Assembler::computeConditionCode(DoubleCondition op, CRegisterID cr) {
++  const uint8_t condBit = crBit(cr, op);
++  const uint8_t fuBit = crBit(cr, DoubleUnordered);
++  uint32_t newop = (uint32_t)op & 255;
++
++  if (op & DoubleConditionUnordered) {
++    if ((uint32_t(op) & BranchOptionMask) == BranchOnClear) {
++      as_crorc(condBit, fuBit, condBit);
++      newop |= BranchOnSet;
++    } else {
++      if (condBit != fuBit) {
++        as_cror(condBit, fuBit, condBit);
++      }
++    }
++  } else {
++    if ((uint32_t(op) & BranchOptionMask) == BranchOnClear) {
++      if (condBit != fuBit) {
++        as_cror(condBit, fuBit, condBit);
++      }
++    } else {
++      if (condBit != fuBit) {
++        as_crandc(condBit, condBit, fuBit);
++      }
++    }
++  }
++
++  return (newop + ((uint8_t)cr << 6));
++}
++
++// Condition code computation: turn Condition + CR into BO|BI.
++// May emit mcrxrx for XER-mediated conditions.
++uint16_t Assembler::computeConditionCode(Condition op, CRegisterID cr) {
++  uint32_t newop = (uint32_t)op & 255;
++
++  if (op & ConditionOnlyXER) {
++    MOZ_ASSERT(op == Overflow || op == NotOverflow);
++    if (HasPOWER9()) {
++      as_mcrxrx(cr);
++    } else {
++      // POWER8: read XER, place OV into the GT position of the target
++      // CR field. Overflow condition (0x1c = GreaterThan) tests GT bit,
++      // which mcrxrx populates with OV32. For 64-bit ops OV == OV32.
++      // XER layout in GPR low 32 bits (IBM): bit 0=SO, 1=OV, 2=CA.
++      // Target: GT position = IBM bit 4*cr+1.
++      xs_mfxer(r0);
++      int gtBit = 4 * (int)cr + 1;          // GT position in CR field
++      int sh = (1 - gtBit) & 31;            // rotate OV from bit 1 to gtBit
++      as_rlwinm(r0, r0, sh, gtBit, gtBit);  // isolate OV at GT only
++      as_mtcrf(1 << (7 - (int)cr), r0);
++    }
++    newop = (uint32_t)op & 255;
++  }
++
++  return (newop + ((uint8_t)cr << 6));
++}
++
++// Given BO|BI in a 16-bit quantity, split into bit fields for instruction.
++static uint32_t makeOpMask(uint16_t op) {
++  MOZ_ASSERT(!(op & 0xfc00));
++  return ((op & 0x0f) << 21) | ((op & 0xfff0) << 12);
++}
++
++BufferOffset Assembler::as_bc(int16_t off, uint16_t op, LikelyBit lkb,
++                              LinkBit lb) {
++  spew("bc%s%s\tBO_BI=0x%04x,%d", lb ? "l" : "", lkb ? "+" : "", op, off);
++  MOZ_ASSERT(!(off & 0x03));
++  return writeInst(Instruction(PPC_bc | makeOpMask(op) | lkb << 21 |
++                               ((uint16_t)off & 0xfffc) | lb)
++                       .encode());
++}
++
++BufferOffset Assembler::as_bcctr(uint16_t op, LikelyBit lkb, LinkBit lb) {
++  spew("bcctr%s%s", lb ? "l" : "", lkb ? "+" : "");
++  return writeInst(PPC_bcctr | makeOpMask(op) | lkb << 21 | lb);
++}
++
++// SPR operations.
++BufferOffset Assembler::as_mtspr(SPRegisterID spr, Register ra) {
++  spew("mtspr\t%d,%3s", spr, ra.name());
++  return writeInst(PPC_mtspr | ra.code() << 21 | PPC_SPR(spr));
++}
++
++BufferOffset Assembler::as_mfspr(Register rd, SPRegisterID spr) {
++  spew("mfspr\t%3s,%d", rd.name(), spr);
++  return writeInst(PPC_mfspr | rd.code() << 21 | PPC_SPR(spr));
++}
++
++// CR operations.
++#define DEF_CRCR(op)                                                 \
++  BufferOffset Assembler::as_##op(uint8_t t, uint8_t a, uint8_t b) { \
++    spew(#op "\t%d,%d,%d", t, a, b);                                 \
++    return writeInst(PPC_##op | t << 21 | a << 16 | b << 11);        \
++  }
++DEF_CRCR(crandc)
++DEF_CRCR(cror)
++DEF_CRCR(crorc)
++#undef DEF_CRCR
++
++BufferOffset Assembler::as_mtcrf(uint32_t mask, Register rs) {
++  spew("mtcrf\t%d,%3s", mask, rs.name());
++  return writeInst(PPC_mtcrf | rs.code() << 21 | mask << 12);
++}
++
++BufferOffset Assembler::as_mfocrf(Register rd, CRegisterID crfs) {
++  spew("mfocrf\t%3s,cr%d", rd.name(), crfs);
++  // FXM is a one-hot 8-bit mask at bits 12-19. Bit (7-crfs) selects the CR.
++  return writeInst(PPC_mfocrf | rd.code() << 21 | (1 << (7 - crfs)) << 12);
++}
++
++BufferOffset Assembler::as_mcrxrx(CRegisterID cr) {
++  spew("mcrxrx\tcr%d", cr);
++  return writeInst(PPC_mcrxrx | cr << 23);
++}
++
++// GPR neg.
++BufferOffset Assembler::as_neg(Register rd, Register rs) {
++  spew("neg\t%3s,%3s", rd.name(), rs.name());
++  return writeInst(InstReg(PPC_neg, rd, rs, r0).encode());
++}
++
++// Compare instructions.
++BufferOffset Assembler::as_cmpd(CRegisterID cr, Register ra, Register rb) {
++  spew("cmpd\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
++  return writeInst(PPC_cmpd | cr << 23 | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmpdi(CRegisterID cr, Register ra, int16_t im) {
++  spew("cmpdi\tcr%d,%3s,%d", cr, ra.name(), im);
++  return writeInst(PPC_cmpdi | cr << 23 | ra.code() << 16 |
++                   ((uint16_t)im & 0xffff));
++}
++
++BufferOffset Assembler::as_cmpld(CRegisterID cr, Register ra, Register rb) {
++  spew("cmpld\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
++  return writeInst(PPC_cmpld | cr << 23 | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmpldi(CRegisterID cr, Register ra, int16_t im) {
++  spew("cmpldi\tcr%d,%3s,%d", cr, ra.name(), im);
++  return writeInst(PPC_cmpldi | cr << 23 | ra.code() << 16 |
++                   ((uint16_t)im & 0xffff));
++}
++
++BufferOffset Assembler::as_cmpw(CRegisterID cr, Register ra, Register rb) {
++  spew("cmpw\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
++  return writeInst(PPC_cmpw | cr << 23 | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmpwi(CRegisterID cr, Register ra, int16_t im) {
++  spew("cmpwi\tcr%d,%3s,%d", cr, ra.name(), im);
++  return writeInst(PPC_cmpwi | cr << 23 | ra.code() << 16 |
++                   ((uint16_t)im & 0xffff));
++}
++
++BufferOffset Assembler::as_cmplw(CRegisterID cr, Register ra, Register rb) {
++  spew("cmplw\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
++  return writeInst(PPC_cmplw | cr << 23 | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmplwi(CRegisterID cr, Register ra, int16_t im) {
++  spew("cmplwi\tcr%d,%3s,%d", cr, ra.name(), im);
++  return writeInst(PPC_cmplwi | cr << 23 | ra.code() << 16 |
++                   ((uint16_t)im & 0xffff));
++}
++
++// Compare instructions (cr0 implicit).
++BufferOffset Assembler::as_cmpd(Register ra, Register rb) {
++  spew("cmpd\t%3s,%3s", ra.name(), rb.name());
++  return writeInst(PPC_cmpd | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmpdi(Register ra, int16_t im) {
++  spew("cmpdi\t%3s,%d", ra.name(), im);
++  return writeInst(PPC_cmpdi | ra.code() << 16 | ((uint16_t)im & 0xffff));
++}
++
++BufferOffset Assembler::as_cmpld(Register ra, Register rb) {
++  spew("cmpld\t%3s,%3s", ra.name(), rb.name());
++  return writeInst(PPC_cmpld | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmpldi(Register ra, int16_t im) {
++  spew("cmpldi\t%3s,%d", ra.name(), im);
++  return writeInst(PPC_cmpldi | ra.code() << 16 | ((uint16_t)im & 0xffff));
++}
++
++BufferOffset Assembler::as_cmpw(Register ra, Register rb) {
++  spew("cmpw\t%3s,%3s", ra.name(), rb.name());
++  return writeInst(PPC_cmpw | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmpwi(Register ra, int16_t im) {
++  spew("cmpwi\t%3s,%d", ra.name(), im);
++  return writeInst(PPC_cmpwi | ra.code() << 16 | ((uint16_t)im & 0xffff));
++}
++
++BufferOffset Assembler::as_cmplw(Register ra, Register rb) {
++  spew("cmplw\t%3s,%3s", ra.name(), rb.name());
++  return writeInst(PPC_cmplw | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmplwi(Register ra, int16_t im) {
++  spew("cmplwi\t%3s,%d", ra.name(), im);
++  return writeInst(PPC_cmplwi | ra.code() << 16 | ((uint16_t)im & 0xffff));
++}
++
++// FP encoding helpers.
++static uint32_t AForm(uint32_t op, FloatRegister frt, FloatRegister fra,
++                      FloatRegister frb, FloatRegister frc, bool rc) {
++  return (op | (frt.encoding() << 21) | (fra.encoding() << 16) |
++          (frb.encoding() << 11) | (frc.encoding() << 6) | rc);
++}
++
++static uint32_t XForm(uint32_t op, FloatRegister frt, FloatRegister fra,
++                      FloatRegister frb, bool rc) {
++  return (op | (frt.encoding() << 21) | (fra.encoding() << 16) |
++          (frb.encoding() << 11) | rc);
++}
++
++static uint32_t XForm(uint32_t op, FloatRegister frt, Register ra, Register rb,
++                      bool rc) {
++  return (op | (frt.encoding() << 21) | (ra.code() << 16) | (rb.code() << 11) |
++          rc);
++}
++
++static uint32_t DForm(uint32_t op, FloatRegister frt, Register ra,
++                      int16_t imm) {
++  return (op | (frt.encoding() << 21) | (ra.code() << 16) |
++          ((uint16_t)imm & 0xffff));
++}
++
++// XX-form encoders. Each form has its own X-bit positions.
++// All take uint32_t encodings (0-63) so they correctly
++// emit the high bit for VSR32-63. FloatRegister.encoding() returns 0-31
++// for Single/Double (= VSR0-31 = FPR namespace) and 32-63 for Simd128
++// (= VSR32-63 = VR namespace) — so a single XX-form encoder addresses
++// the full VSR space.
++
++// XX1-form: T + GPR (RA) + GPR (RB). TX bit at instruction bit 0.
++// Used by lxvx, stxvx, lxvd2x, stxvd2x, mtvsrdd, mtvsrd, mtvsrws, mtvsrwz.
++static uint32_t XX1Form(uint32_t op, uint32_t xt, uint32_t ra, uint32_t rb) {
++  return op | (xt & 31) << 21 | (ra & 31) << 16 | (rb & 31) << 11 |
++         ((xt >> 5) & 1);
++}
++
++// XX1-form for mfvsrX: GPR (RT) + VSR (XS). TX bit ("SX") at instruction
++// bit 0; the X spec calls this SX since the source register is the VSR.
++// Used by mfvsrd, mfvsrld.
++static uint32_t XX1FormMfvsr(uint32_t op, uint32_t rt, uint32_t xs) {
++  return op | (xs & 31) << 21 | (rt & 31) << 16 | ((xs >> 5) & 1);
++}
++
++// XX2-form: T + B (no A field; bits 16-20 unused or hold a UIM). BX bit
++// at instruction bit 1, TX bit at instruction bit 0. The bits16-20 slot
++// is set by callers — for plain XX2 it must be 0, for XX2 with UIM it
++// holds the immediate.
++// Used by xxbrd, xxbrh, xxbrw, xxbrq, xscvdpsp, xscvspdp, xscvdpspn,
++// xscvspdpn, xxspltw (UIM=2 bits), xxinsertw (UIM=4 bits),
++// xxextractuw (UIM=4 bits), xvabs*/xvneg*/xvsqrt*/xvr* etc. via
++// DEF_VSX_UN.
++static uint32_t XX2Form(uint32_t op, uint32_t xt, uint32_t xb,
++                        uint32_t bits16to20 = 0) {
++  return op | (xt & 31) << 21 | (bits16to20 & 31) << 16 | (xb & 31) << 11 |
++         ((xb >> 5) & 1) << 1 | ((xt >> 5) & 1);
++}
++
++// XX3-form: T + A + B. AX/BX/TX bits at instruction bits 2/1/0.
++// Used by xxlor, xxland, xxlxor, xxlnor, xxlandc, xxpermdi, xsmaxjdp,
++// xsminjdp, xvadd*, xvcmp*, etc.
++static uint32_t XX3Form(uint32_t op, uint32_t xt, uint32_t xa, uint32_t xb) {
++  return op | (xt & 31) << 21 | (xa & 31) << 16 | (xb & 31) << 11 |
++         ((xa >> 5) & 1) << 2 | ((xb >> 5) & 1) << 1 | ((xt >> 5) & 1);
++}
++
++// XX4-form: T + A + B + C. CX/AX/BX/TX bits at instruction bits 3/2/1/0.
++// Used by xxsel.
++static uint32_t XX4Form(uint32_t op, uint32_t xt, uint32_t xa, uint32_t xb,
++                        uint32_t xc) {
++  return op | (xt & 31) << 21 | (xa & 31) << 16 | (xb & 31) << 11 |
++         (xc & 31) << 6 | ((xc >> 5) & 1) << 3 | ((xa >> 5) & 1) << 2 |
++         ((xb >> 5) & 1) << 1 | ((xt >> 5) & 1);
++}
++
++// FloatRegister convenience overload for XX3Form (the most common form).
++static uint32_t XX3Form(uint32_t op, FloatRegister xt, FloatRegister xa,
++                        FloatRegister xb) {
++  return XX3Form(op, uint32_t(xt.encoding()), uint32_t(xa.encoding()),
++                 uint32_t(xb.encoding()));
++}
++
++// --- Macro-defined instruction emitters ---
++
++// X-form: rd in bits 21-25, ra in 16-20, rb in 11-15.
++#define DEF_XFORM(op)                                                      \
++  BufferOffset Assembler::as_##op(Register rd, Register ra, Register rb) { \
++    spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name());            \
++    return writeInst(InstReg(PPC_##op, rd, ra, rb).encode());              \
++  }
++
++#define DEF_XFORM_RC(op)                                            \
++  BufferOffset Assembler::as_##op##_rc(Register rd, Register ra,    \
++                                       Register rb) {               \
++    spew(#op ".\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name());    \
++    return writeInst(InstReg(PPC_##op, rd, ra, rb).encode() | 0x1); \
++  }
++
++// X-form with swapped RS/RA encoding: rs in bits 21-25, ra in 16-20.
++#define DEF_XFORMS(op)                                                     \
++  BufferOffset Assembler::as_##op(Register rd, Register ra, Register rb) { \
++    spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name());            \
++    return writeInst(InstReg(PPC_##op, ra, rd, rb).encode());              \
++  }
++
++#define DEF_XFORMS_RC(op)                                           \
++  BufferOffset Assembler::as_##op##_rc(Register rd, Register ra,    \
++                                       Register rb) {               \
++    spew(#op ".\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name());    \
++    return writeInst(InstReg(PPC_##op, ra, rd, rb).encode() | 0x1); \
++  }
++
++// X-form shift immediate with swapped encoding.
++#define DEF_XFORMS_I(op)                                                       \
++  BufferOffset Assembler::as_##op(Register rd, Register ra, uint8_t sh) {      \
++    spew(#op "\t%3s,%3s,%d", rd.name(), ra.name(), sh);                        \
++    MOZ_ASSERT(sh < 32);                                                       \
++    return writeInst(PPC_##op | ra.code() << 21 | rd.code() << 16 | sh << 11); \
++  }
++
++// 2-reg X-form: rd in bits 21-25, ra in 16-20, rb=r0.
++#define DEF_XFORM2(op)                                        \
++  BufferOffset Assembler::as_##op(Register rd, Register ra) { \
++    spew(#op "\t%3s,%3s", rd.name(), ra.name());              \
++    return writeInst(InstReg(PPC_##op, rd, ra, r0).encode()); \
++  }
++
++#define DEF_XFORM2_RC(op)                                           \
++  BufferOffset Assembler::as_##op##_rc(Register rd, Register ra) {  \
++    spew(#op ".\t%3s,%3s", rd.name(), ra.name());                   \
++    return writeInst(InstReg(PPC_##op, rd, ra, r0).encode() | 0x1); \
++  }
++
++// 2-reg X-form swapped: ra in bits 21-25, rd in 16-20.
++#define DEF_XFORM2S(op)                                       \
++  BufferOffset Assembler::as_##op(Register rd, Register ra) { \
++    spew(#op "\t%3s,%3s", rd.name(), ra.name());              \
++    return writeInst(InstReg(PPC_##op, ra, rd, r0).encode()); \
++  }
++
++#define DEF_XFORM2S_RC(op)                                          \
++  BufferOffset Assembler::as_##op##_rc(Register rd, Register ra) {  \
++    spew(#op ".\t%3s,%3s", rd.name(), ra.name());                   \
++    return writeInst(InstReg(PPC_##op, ra, rd, r0).encode() | 0x1); \
++  }
++
++// D-form load/store: rd=RT, rb=RA (base register), off=displacement.
++// r0 cannot be used as base register for D-form loads/stores.
++#define DEF_DFORM(op)                                                      \
++  BufferOffset Assembler::as_##op(Register rd, Register rb, int16_t off) { \
++    spew(#op "\t%3s,%d(%3s)", rd.name(), off, rb.name());                  \
++    MOZ_ASSERT(rb != r0);                                                  \
++    return writeInst(InstImm(PPC_##op, rd, rb, off).encode());             \
++  }
++
++// D-form with swapped RS/RA encoding for logical immediates.
++#define DEF_DFORMS(op)                                                     \
++  BufferOffset Assembler::as_##op(Register rd, Register ra, uint16_t im) { \
++    spew(#op "\t%3s,%3s,%d", rd.name(), ra.name(), im);                    \
++    return writeInst(InstImm(PPC_##op, ra, rd, im).encode());              \
++  }
++
++// M-form: rotate with 3 registers + mb + me.
++#define DEF_MFORM(op)                                                         \
++  BufferOffset Assembler::as_##op(Register rd, Register rs, Register rb,      \
++                                  uint8_t mb, uint8_t me) {                   \
++    spew(#op "\t%3s,%3s,%3s,%d,%d", rd.name(), rs.name(), rb.name(), mb, me); \
++    MOZ_ASSERT(mb < 32);                                                      \
++    MOZ_ASSERT(me < 32);                                                      \
++    return writeInst(PPC_##op | rs.code() << 21 | rd.code() << 16 |           \
++                     rb.code() << 11 | mb << 6 | me << 1);                    \
++  }
++
++// M-form with immediate shift.
++#define DEF_MFORM_I(op)                                                        \
++  BufferOffset Assembler::as_##op(Register rd, Register rs, uint8_t sh,        \
++                                  uint8_t mb, uint8_t me) {                    \
++    spew(#op "\t%3s,%3s,%d,%d,%d", rd.name(), rs.name(), sh, mb, me);          \
++    MOZ_ASSERT(sh < 32);                                                       \
++    MOZ_ASSERT(mb < 32);                                                       \
++    MOZ_ASSERT(me < 32);                                                       \
++    return writeInst(PPC_##op | rs.code() << 21 | rd.code() << 16 | sh << 11 | \
++                     mb << 6 | me << 1);                                       \
++  }
++
++#define DEF_MFORM_I_RC(op)                                                     \
++  BufferOffset Assembler::as_##op##_rc(Register rd, Register rs, uint8_t sh,   \
++                                       uint8_t mb, uint8_t me) {               \
++    spew(#op ".\t%3s,%3s,%d,%d,%d", rd.name(), rs.name(), sh, mb, me);         \
++    MOZ_ASSERT(sh < 32);                                                       \
++    MOZ_ASSERT(mb < 32);                                                       \
++    MOZ_ASSERT(me < 32);                                                       \
++    return writeInst(PPC_##op | rs.code() << 21 | rd.code() << 16 | sh << 11 | \
++                     mb << 6 | me << 1 | 1);                                   \
++  }
++
++// MDS-form: rotate with register + mb (64-bit).
++#define DEF_MDSFORM(op)                                                   \
++  BufferOffset Assembler::as_##op(Register ra, Register rs, Register rb,  \
++                                  uint8_t mb) {                           \
++    spew(#op "\t%3s,%3s,%3s,%d", ra.name(), rs.name(), rb.name(), mb);    \
++    MOZ_ASSERT(mb < 64);                                                  \
++    return writeInst(PPC_##op | rs.code() << 21 | ra.code() << 16 |       \
++                     rb.code() << 11 | ((mb & 0x1f) << 6) | (mb & 0x20)); \
++  }
++
++#define DEF_MDSFORM_RC(op)                                                    \
++  BufferOffset Assembler::as_##op##_rc(Register ra, Register rs, Register rb, \
++                                       uint8_t mb) {                          \
++    spew(#op ".\t%3s,%3s,%3s,%d", ra.name(), rs.name(), rb.name(), mb);       \
++    MOZ_ASSERT(mb < 64);                                                      \
++    return writeInst(PPC_##op | rs.code() << 21 | ra.code() << 16 |           \
++                     rb.code() << 11 | ((mb & 0x1f) << 6) | (mb & 0x20) | 1); \
++  }
++
++// MD-form: rotate/shift with immediate sh + mb (64-bit).
++// sh and mb are 6-bit fields split across the instruction word.
++#define DEF_MDFORM(op)                                                        \
++  BufferOffset Assembler::as_##op(Register ra, Register rs, uint8_t sh,       \
++                                  uint8_t mb) {                               \
++    spew(#op "\t%3s,%3s,%d,%d", ra.name(), rs.name(), sh, mb);                \
++    MOZ_ASSERT(sh < 64);                                                      \
++    MOZ_ASSERT(mb < 64);                                                      \
++    return writeInst(PPC_##op | rs.code() << 21 | ra.code() << 16 |           \
++                     ((sh & 0x1f) << 11) | ((mb & 0x1f) << 6) | (mb & 0x20) | \
++                     ((sh & 0x20) >> 4));                                     \
++  }
++
++#define DEF_MDFORM_RC(op)                                                     \
++  BufferOffset Assembler::as_##op##_rc(Register ra, Register rs, uint8_t sh,  \
++                                       uint8_t mb) {                          \
++    spew(#op ".\t%3s,%3s,%d,%d", ra.name(), rs.name(), sh, mb);               \
++    MOZ_ASSERT(sh < 64);                                                      \
++    MOZ_ASSERT(mb < 64);                                                      \
++    return writeInst(PPC_##op | rs.code() << 21 | ra.code() << 16 |           \
++                     ((sh & 0x1f) << 11) | ((mb & 0x1f) << 6) | (mb & 0x20) | \
++                     ((sh & 0x20) >> 4) | 0x01);                              \
++  }
++
++// FP 2-reg X-form: frt in bits 21-25, fra=f0, frb in 11-15.
++#define DEF_XFORM2_F(op)                                                \
++  BufferOffset Assembler::as_##op(FloatRegister rd, FloatRegister ra) { \
++    spew(#op "\t%3s,%3s", rd.name(), ra.name());                        \
++    return writeInst(XForm(PPC_##op, rd, f0, ra, false));               \
++  }
++
++#define DEF_XFORM2_F_RC(op)                                                  \
++  BufferOffset Assembler::as_##op##_rc(FloatRegister rd, FloatRegister ra) { \
++    spew(#op ".\t%3s,%3s", rd.name(), ra.name());                            \
++    return writeInst(XForm(PPC_##op, rd, f0, ra, true));                     \
++  }
++
++// FP A-form with frc (fmul-type): frt, fra, frc; frb=f0.
++#define DEF_AFORM_C(op)                                               \
++  BufferOffset Assembler::as_##op(FloatRegister rd, FloatRegister ra, \
++                                  FloatRegister rc) {                 \
++    spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rc.name());       \
++    return writeInst(AForm(PPC_##op, rd, ra, f0, rc, false));         \
++  }
++
++#define DEF_AFORM_C_RC(op)                                                 \
++  BufferOffset Assembler::as_##op##_rc(FloatRegister rd, FloatRegister ra, \
++                                       FloatRegister rc) {                 \
++    spew(#op ".\t%3s,%3s,%3s", rd.name(), ra.name(), rc.name());           \
++    return writeInst(AForm(PPC_##op, rd, ra, f0, rc, true));               \
++  }
++
++// FP A-form with frb (fadd-type): frt, fra, frb; frc=f0.
++#define DEF_AFORM_B(op)                                               \
++  BufferOffset Assembler::as_##op(FloatRegister rd, FloatRegister ra, \
++                                  FloatRegister rb) {                 \
++    spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name());       \
++    return writeInst(AForm(PPC_##op, rd, ra, rb, f0, false));         \
++  }
++
++#define DEF_AFORM_B_RC(op)                                                 \
++  BufferOffset Assembler::as_##op##_rc(FloatRegister rd, FloatRegister ra, \
++                                       FloatRegister rb) {                 \
++    spew(#op ".\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name());           \
++    return writeInst(AForm(PPC_##op, rd, ra, rb, f0, true));               \
++  }
++
++// Full FP A-form: frt, fra, frc, frb (fmadd-type).
++#define DEF_AFORM(op)                                                          \
++  BufferOffset Assembler::as_##op(FloatRegister rd, FloatRegister ra,          \
++                                  FloatRegister rc, FloatRegister rb) {        \
++    spew(#op "\t%3s,%3s,%3s,%3s", rd.name(), ra.name(), rc.name(), rb.name()); \
++    return writeInst(AForm(PPC_##op, rd, ra, rb, rc, false));                  \
++  }
++
++#define DEF_AFORM_RC(op)                                                     \
++  BufferOffset Assembler::as_##op##_rc(FloatRegister rd, FloatRegister ra,   \
++                                       FloatRegister rc, FloatRegister rb) { \
++    spew(#op ".\t%3s,%3s,%3s,%3s", rd.name(), ra.name(), rc.name(),          \
++         rb.name());                                                         \
++    return writeInst(AForm(PPC_##op, rd, ra, rb, rc, true));                 \
++  }
++
++// FP D-form load/store.
++#define DEF_DFORM_F(op)                                          \
++  BufferOffset Assembler::as_##op(FloatRegister rd, Register rb, \
++                                  int16_t off) {                 \
++    spew(#op "\t%3s,%d(%3s)", rd.name(), off, rb.name());        \
++    MOZ_ASSERT(rb != r0);                                        \
++    return writeInst(DForm(PPC_##op, rd, rb, off));              \
++  }
++
++// FP X-form indexed load/store.
++#define DEF_FMEMx(op)                                            \
++  BufferOffset Assembler::as_##op(FloatRegister rd, Register ra, \
++                                  Register rb) {                 \
++    spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name());  \
++    return writeInst(XForm(PPC_##op, rd, ra, rb, false));        \
++  }
++
++// --- Rotate/shift instructions ---
++
++DEF_MFORM(rlwnm)
++DEF_MFORM_I(rlwinm)
++DEF_MFORM_I_RC(rlwinm)
++DEF_MFORM_I(rlwimi)
++DEF_XFORMS_I(srawi)
++
++DEF_MDSFORM(rldcl)
++DEF_MDFORM(rldicl)
++DEF_MDFORM_RC(rldicl)
++DEF_MDFORM(rldicr)
++DEF_MDFORM_RC(rldicr)
++DEF_MDFORM(rldimi)
++
++BufferOffset Assembler::as_sradi(Register rd, Register rs, int sh) {
++  spew("sradi\t%3s,%3s,%d", rd.name(), rs.name(), sh);
++  MOZ_ASSERT(sh >= 0 && sh < 64);
++  return writeInst(PPC_sradi | rd.code() << 16 | rs.code() << 21 |
++                   (sh & 0x1f) << 11 | (sh & 0x20) >> 4);
++}
++
++// --- ALU three-register ---
++
++#define DEF_ALU2(op) DEF_XFORM(op)
++
++DEF_ALU2(add)
++DEF_ALU2(addc)
++DEF_ALU2(adde)
++DEF_ALU2(subf)
++DEF_ALU2(subfc)
++DEF_ALU2(subfe)
++DEF_ALU2(divd)
++DEF_ALU2(divdu)
++DEF_ALU2(divw)
++DEF_ALU2(divwu)
++// POWER9 modulo (XO-form, same encoding pattern as div).
++DEF_XFORM(modsd)
++DEF_XFORM(modsw)
++DEF_XFORM(modud)
++DEF_XFORM(moduw)
++DEF_ALU2(mulld)
++DEF_ALU2(mulhd)
++DEF_ALU2(mulhdu)
++DEF_ALU2(mulldo)
++DEF_ALU2(mullw)
++DEF_ALU2(mulhwu)
++#undef DEF_ALU2
++
++// --- ALU immediate ---
++
++// D-form ALU-immediate ops have no Rc bit at instruction LSB (that bit
++// is part of the 16-bit immediate). The only valid record-form variant
++// in this group is `addic.`, which is a separate primary opcode (13)
++// hand-written below; subfic and mulli have no record form at all.
++#define DEF_ALUI(op)                                                      \
++  BufferOffset Assembler::as_##op(Register rd, Register ra, int16_t im) { \
++    spew(#op "\t%3s,%3s,%d", rd.name(), ra.name(), im);                   \
++    return writeInst(InstImm(PPC_##op, rd, ra, im).encode());             \
++  }
++
++BufferOffset Assembler::as_addi(Register rd, Register ra, int16_t im,
++                                bool actually_li) {
++#ifdef DEBUG
++  if (actually_li) {
++    spew("li\t%3s,%d", rd.name(), im);
++  } else {
++    MOZ_ASSERT(ra != r0);
++    spew("addi\t%3s,%3s,%d", rd.name(), ra.name(), im);
++  }
++#endif
++  return writeInst(InstImm(PPC_addi, rd, ra, im).encode());
++}
++
++BufferOffset Assembler::as_addis(Register rd, Register ra, int16_t im,
++                                 bool actually_lis) {
++#ifdef DEBUG
++  if (actually_lis) {
++    spew("lis\t%3s,%d", rd.name(), im);
++  } else {
++    MOZ_ASSERT(ra != r0);
++    spew("addis\t%3s,%3s,%d", rd.name(), ra.name(), im);
++  }
++#endif
++  return writeInst(InstImm(PPC_addis, rd, ra, im).encode());
++}
++
++DEF_ALUI(mulli)
++DEF_ALUI(subfic)
++#undef DEF_ALUI
++
++// --- ALU unary/extended ---
++
++
++#define DEF_ALUE_S(op) DEF_XFORM2S(op)
++DEF_ALUE_S(cntlzw)
++DEF_ALUE_S(cntlzd)
++DEF_ALUE_S(cnttzd)
++DEF_ALUE_S(cnttzw)
++#undef DEF_ALUE_S
++
++DEF_XFORM2S(popcntd)
++DEF_XFORM2S(popcntw)
++DEF_XFORM2S(brd)  // POWER10
++DEF_XFORM2S(brh)  // POWER10
++DEF_XFORM2S(brw)  // POWER10
++
++// --- Bitwise logical (three-register) ---
++
++#define DEF_BITALU2(op) DEF_XFORMS(op)
++DEF_BITALU2(nor)
++DEF_BITALU2(slw)
++DEF_BITALU2(srw)
++DEF_BITALU2(sraw)
++DEF_BITALU2(sld)
++DEF_BITALU2(srd)
++DEF_BITALU2(srad)
++#undef DEF_BITALU2
++
++// and_, or_, xor_ are manually defined (trailing underscore to avoid C++
++// keyword conflicts). xs_mr delegates to as_or_ so we must not assert
++// rd==rs==rb in as_or_ (which would be a valid mr).
++BufferOffset Assembler::as_or_(Register rd, Register rs, Register rb) {
++  spew("or\t%3s,%3s,%3s", rd.name(), rs.name(), rb.name());
++  return writeInst(InstReg(PPC_or_, rs, rd, rb).encode());
++}
++
++BufferOffset Assembler::as_xor_(Register rd, Register rs, Register rb) {
++  spew("xor\t%3s,%3s,%3s", rd.name(), rs.name(), rb.name());
++  return writeInst(InstReg(PPC_xor_, rs, rd, rb).encode());
++}
++
++BufferOffset Assembler::as_and_(Register rd, Register rs, Register rb) {
++  spew("and\t%3s,%3s,%3s", rd.name(), rs.name(), rb.name());
++  return writeInst(InstReg(PPC_and_, rs, rd, rb).encode());
++}
++
++BufferOffset Assembler::as_and__rc(Register rd, Register rs, Register rb) {
++  spew("and.\t%3s,%3s,%3s", rd.name(), rs.name(), rb.name());
++  return writeInst(InstReg(PPC_and_, rs, rd, rb).encode() | 0x1);
++}
++
++// --- Bitwise logical (immediate) ---
++
++DEF_DFORMS(ori)
++DEF_DFORMS(oris)
++DEF_DFORMS(xori)
++DEF_DFORMS(xoris)
++
++BufferOffset Assembler::as_andi_rc(Register rd, Register ra, uint16_t im) {
++  spew("andi.\t%3s,%3s,%d", rd.name(), ra.name(), im);
++  return writeInst(InstImm(PPC_andi_dot, ra, rd, im).encode());
++}
++
++// --- Sign extension ---
++
++#define DEF_ALUEXT(op) DEF_XFORM2S(op) DEF_XFORM2S_RC(op)
++DEF_XFORM2S(extsb)
++DEF_XFORM2S(extsh)
++DEF_ALUEXT(extsw)
++#undef DEF_ALUEXT
++
++// --- Integer loads (D-form) ---
++
++DEF_DFORM(lbz)
++DEF_DFORM(lha)
++DEF_DFORM(lhz)
++
++BufferOffset Assembler::as_lwa(Register rd, Register rb, int16_t off) {
++  spew("lwa\t%3s,%d(%3s)", rd.name(), off, rb.name());
++  MOZ_ASSERT(rb != r0);
++  MOZ_ASSERT(!(off & 0x03));
++  return writeInst(InstImm(PPC_lwa, rd, rb, off).encode());
++}
++
++DEF_DFORM(lwz)
++
++BufferOffset Assembler::as_ld(Register rd, Register rb, int16_t off) {
++  spew("ld\t%3s,%d(%3s)", rd.name(), off, rb.name());
++  MOZ_ASSERT(rb != r0);
++  MOZ_ASSERT(!(off & 0x03));
++  return writeInst(InstImm(PPC_ld, rd, rb, off).encode());
++}
++
++// --- Integer stores (D-form) ---
++
++DEF_DFORM(stb)
++DEF_DFORM(sth)
++DEF_DFORM(stw)
++
++BufferOffset Assembler::as_std(Register rd, Register rb, int16_t off) {
++  spew("std\t%3s,%d(%3s)", rd.name(), off, rb.name());
++  MOZ_ASSERT(rb != r0);
++  MOZ_ASSERT(!(off & 0x03));
++  return writeInst(InstImm(PPC_std, rd, rb, off).encode());
++}
++
++DEF_DFORM(stdu)
++
++#undef DEF_DFORM
++#undef DEF_DFORMS
++
++// --- Integer loads/stores (X-form, indexed) ---
++
++#define DEF_MEMx(op) DEF_XFORM(op)
++DEF_MEMx(lbzx) DEF_MEMx(lhax) DEF_MEMx(lhzx) DEF_MEMx(lwax)
++    DEF_MEMx(lwzx) DEF_MEMx(lwarx) DEF_MEMx(lbarx)
++        DEF_MEMx(lharx) DEF_MEMx(ldx) DEF_MEMx(ldarx) DEF_MEMx(stbx)
++            DEF_MEMx(stbcx) DEF_MEMx(stwx) DEF_MEMx(stwbrx) DEF_MEMx(sthx)
++                DEF_MEMx(sthcx) DEF_MEMx(stdx) DEF_MEMx(stdcx)
++                    DEF_MEMx(stwcx)
++#undef DEF_MEMx
++
++// --- Integer select ---
++
++BufferOffset Assembler::as_isel(Register rt, Register ra, Register rb,
++                                uint16_t bc, CRegisterID cr) {
++  MOZ_ASSERT(ra != r0);
++  return as_isel0(rt, ra, rb, bc, cr);
++}
++
++BufferOffset Assembler::as_isel0(Register rt, Register ra, Register rb,
++                                 uint16_t bc, CRegisterID cr) {
++  spew("isel\t%3s,%3s,%3s,cr%d:0x%02x", rt.name(), ra.name(), rb.name(), cr,
++       bc);
++  MOZ_ASSERT((bc < 0x40) && ((bc & 0x0f) == 0x0c));
++  uint16_t nbc = (bc >> 4) + (cr << 2);
++  return writeInst(PPC_isel | rt.code() << 21 | ra.code() << 16 |
++                   rb.code() << 11 | nbc << 6);
++}
++
++BufferOffset Assembler::as_setbc(Register rt, uint16_t bc, CRegisterID cr) {
++  spew("setbc\t%3s,cr%d:0x%02x", rt.name(), cr, bc);
++  MOZ_ASSERT((bc < 0x40) && ((bc & 0x0f) == 0x0c));
++  uint16_t nbc = (bc >> 4) + (cr << 2);
++  return writeInst(PPC_setbc | (rt.code() << 21) | (nbc << 16));
++}
++
++BufferOffset Assembler::as_setbcr(Register rt, uint16_t bc, CRegisterID cr) {
++  spew("setbcr\t%3s,cr%d:0x%02x", rt.name(), cr, bc);
++  MOZ_ASSERT((bc < 0x40) && ((bc & 0x0f) == 0x0c));
++  uint16_t nbc = (bc >> 4) + (cr << 2);
++  return writeInst(PPC_setbcr | (rt.code() << 21) | (nbc << 16));
++}
++
++// --- FP compare ---
++
++BufferOffset Assembler::as_fcmpu(CRegisterID cr, FloatRegister ra,
++                                 FloatRegister rb) {
++  spew("fcmpu\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
++  return writeInst(PPC_fcmpu | cr << 23 | ra.encoding() << 16 |
++                   rb.encoding() << 11);
++}
++
++BufferOffset Assembler::as_fcmpu(FloatRegister ra, FloatRegister rb) {
++  return as_fcmpu(cr0, ra, rb);
++}
++
++// --- FP arithmetic ---
++
++#define DEF_FPUAC(op) DEF_AFORM_C(op)
++DEF_FPUAC(fmul)
++DEF_FPUAC(fmuls)
++#undef DEF_FPUAC
++
++#define DEF_FPUAB(op) DEF_AFORM_B(op)
++DEF_FPUAB(fadd)
++DEF_FPUAB(fdiv)
++DEF_FPUAB(fsub)
++DEF_FPUAB(fadds)
++DEF_FPUAB(fdivs)
++DEF_FPUAB(fsubs)
++DEF_FPUAB(fcpsgn)
++#undef DEF_FPUAB
++
++// --- FP unary/conversion/rounding ---
++
++#define DEF_FPUDS(op) DEF_XFORM2_F(op)
++DEF_FPUDS(fabs)
++DEF_FPUDS(fneg)
++DEF_FPUDS(fmr)
++DEF_FPUDS(fcfid)
++DEF_FPUDS(fcfids)
++DEF_FPUDS(fcfidu)
++DEF_FPUDS(fcfidus)
++DEF_FPUDS(fctid)
++DEF_FPUDS(fctidz)
++DEF_FPUDS(fctiduz)
++DEF_FPUDS(fctiwz)
++DEF_FPUDS(frim)
++DEF_FPUDS(frip)
++DEF_FPUDS(friz)
++DEF_FPUDS(frsp)
++DEF_FPUDS(fsqrt)
++DEF_FPUDS(fsqrts)
++#undef DEF_FPUDS
++
++// --- FP loads/stores (D-form) ---
++
++DEF_DFORM_F(lfd)
++DEF_DFORM_F(lfs)
++DEF_DFORM_F(stfd)
++DEF_DFORM_F(stfs)
++DEF_DFORM_F(stfdu)
++DEF_DFORM_F(stfsu)
++
++// --- FP loads/stores (X-form, indexed) ---
++
++DEF_FMEMx(lfdx) DEF_FMEMx(lfsx) DEF_FMEMx(lfiwax)
++    DEF_FMEMx(stfdx) DEF_FMEMx(stfsx)
++// Clean up macros.
++#undef DEF_XFORM
++#undef DEF_XFORM_RC
++#undef DEF_XFORMS
++#undef DEF_XFORMS_RC
++#undef DEF_XFORMS_I
++#undef DEF_XFORM2
++#undef DEF_XFORM2_RC
++#undef DEF_XFORM2S
++#undef DEF_XFORM2S_RC
++#undef DEF_XFORM2_F
++#undef DEF_XFORM2_F_RC
++#undef DEF_MFORM
++#undef DEF_MFORM_I
++#undef DEF_MFORM_I_RC
++#undef DEF_MDSFORM
++#undef DEF_MDSFORM_RC
++#undef DEF_MDFORM
++#undef DEF_MDFORM_RC
++#undef DEF_DFORM_F
++#undef DEF_FMEMx
++#undef DEF_AFORM_C
++#undef DEF_AFORM_C_RC
++#undef DEF_AFORM_B
++#undef DEF_AFORM_B_RC
++#undef DEF_AFORM
++#undef DEF_AFORM_RC
++
++    // --- FPSCR operations ---
++
++    BufferOffset Assembler::as_mtfsb0(uint8_t bt) {
++  spew("mtfsb0\t%d", bt);
++  return writeInst(PPC_mtfsb0 | (uint32_t)bt << 21);
++}
++
++BufferOffset Assembler::as_mcrfs(CRegisterID bf, uint8_t bfa) {
++  spew("mcrfs\tcr%d,%d", bf, bfa);
++  return writeInst(PPC_mcrfs | (uint32_t)bf << 23 | (uint32_t)bfa << 18);
++}
++
++// --- VSX (FPR-only subset) ---
++
++BufferOffset Assembler::as_mfvsrd(Register ra, FloatRegister xs) {
++  spew("mfvsrd\t%3s,%3s", ra.name(), xs.name());
++  return writeInst(XX1FormMfvsr(PPC_mfvsrd, ra.code(), xs.encoding()));
++}
++
++BufferOffset Assembler::as_mtvsrd(FloatRegister xt, Register ra) {
++  spew("mtvsrd\t%3s,%3s", xt.name(), ra.name());
++  return writeInst(XX1Form(PPC_mtvsrd, xt.encoding(), ra.code(), 0));
++}
++
++BufferOffset Assembler::as_mtvsrwa(FloatRegister xt, Register ra) {
++  spew("mtvsrwa\t%3s,%3s", xt.name(), ra.name());
++  return writeInst(XX1Form(PPC_mtvsrwa, xt.encoding(), ra.code(), 0));
++}
++
++BufferOffset Assembler::as_mtvsrws(FloatRegister xt, Register ra) {
++  spew("mtvsrws\t%3s,%3s", xt.name(), ra.name());
++  return writeInst(XX1Form(PPC_mtvsrws, xt.encoding(), ra.code(), 0));
++}
++
++BufferOffset Assembler::as_mtvsrwz(FloatRegister xt, Register ra) {
++  spew("mtvsrwz\t%3s,%3s", xt.name(), ra.name());
++  return writeInst(XX1Form(PPC_mtvsrwz, xt.encoding(), ra.code(), 0));
++}
++
++BufferOffset Assembler::as_xxbrd(FloatRegister xt, FloatRegister xb) {
++  spew("xxbrd\t%3s,%3s", xt.name(), xb.name());
++  return writeInst(XX2Form(PPC_xxbrd, xt.encoding(), xb.encoding()));
++}
++
++BufferOffset Assembler::as_xscvdpspn(FloatRegister xt, FloatRegister xb) {
++  spew("xscvdpspn\t%3s,%3s", xt.name(), xb.name());
++  return writeInst(XX2Form(PPC_xscvdpspn, xt.encoding(), xb.encoding()));
++}
++
++BufferOffset Assembler::as_xscvspdpn(FloatRegister xt, FloatRegister xb) {
++  spew("xscvspdpn\t%3s,%3s", xt.name(), xb.name());
++  return writeInst(XX2Form(PPC_xscvspdpn, xt.encoding(), xb.encoding()));
++}
++
++// POWER9 (ISA 3.0) scalar FP16 conversions. The UIM disambiguator is
++// already in PPC_xscvdphp / PPC_xscvhpdp; XX2Form's bits16to20 default
++// of 0 leaves it intact.
++BufferOffset Assembler::as_xscvdphp(FloatRegister xt, FloatRegister xb) {
++  spew("xscvdphp\t%3s,%3s", xt.name(), xb.name());
++  return writeInst(XX2Form(PPC_xscvdphp, xt.encoding(), xb.encoding()));
++}
++
++BufferOffset Assembler::as_xscvhpdp(FloatRegister xt, FloatRegister xb) {
++  spew("xscvhpdp\t%3s,%3s", xt.name(), xb.name());
++  return writeInst(XX2Form(PPC_xscvhpdp, xt.encoding(), xb.encoding()));
++}
++
++BufferOffset Assembler::as_xsxexpdp(FloatRegister xt, FloatRegister xb) {
++  spew("xsxexpdp\t%3s,%3s", xt.name(), xb.name());
++  return writeInst(XX2Form(PPC_xsxexpdp, xt.encoding(), xb.encoding()));
++}
++
++// POWER9 (ISA 3.0) FP16 load/store, X-form indexed. lxsihzx loads
++// 16 bits into VSR dword 0 word 1's low halfword (zeroing the rest);
++// stxsihx stores from there. The XT[5]/XS[5] bit travels via the
++// X-form's TX/SX bit at instruction bit 0.
++BufferOffset Assembler::as_lxsihzx(FloatRegister xt, Register ra, Register rb) {
++  spew("lxsihzx\t%3s,%3s,%3s", xt.name(), ra.name(), rb.name());
++  return writeInst(PPC_lxsihzx | (xt.encoding() & 31) << 21 |
++                   ra.code() << 16 | rb.code() << 11 |
++                   ((xt.encoding() >> 5) & 1));
++}
++
++BufferOffset Assembler::as_stxsihx(FloatRegister xs, Register ra, Register rb) {
++  spew("stxsihx\t%3s,%3s,%3s", xs.name(), ra.name(), rb.name());
++  return writeInst(PPC_stxsihx | (xs.encoding() & 31) << 21 |
++                   ra.code() << 16 | rb.code() << 11 |
++                   ((xs.encoding() >> 5) & 1));
++}
++
++// XX3-form, FPR-space only (encoding 0..31 → VSR0..31, all AX/BX/TX = 0).
++// Java/JavaScript-style scalar max/min — semantics verified to match
++// ECMA-262 Math.max/Math.min including ±0 and NaN propagation. POWER9-only.
++BufferOffset Assembler::as_xsmaxjdp(FloatRegister xt, FloatRegister xa,
++                                    FloatRegister xb) {
++  spew("xsmaxjdp\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
++  return writeInst(XX3Form(PPC_xsmaxjdp, xt, xa, xb));
++}
++
++BufferOffset Assembler::as_xsminjdp(FloatRegister xt, FloatRegister xa,
++                                    FloatRegister xb) {
++  spew("xsminjdp\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
++  return writeInst(XX3Form(PPC_xsminjdp, xt, xa, xb));
++}
++
++// --- VSX SIMD load/store ---
++
++// For VSX0-31 (FPR), the 6th register bit (TX/SX/BX) is 0.
++// X-form: opcode | T << 21 | A << 16 | B << 11 | xo | TX
++// lxvx/stxvx are POWER9 (ISA 3.0). lxvd2x/stxvd2x are POWER8 (ISA 2.07).
++
++BufferOffset Assembler::as_lxvx(FloatRegister xt, Register ra, Register rb) {
++  spew("lxvx\t%3s,%3s,%3s", xt.name(), ra.name(), rb.name());
++  return writeInst(XX1Form(PPC_lxvx, xt.encoding(), ra.code(), rb.code()));
++}
++
++BufferOffset Assembler::as_stxvx(FloatRegister xs, Register ra, Register rb) {
++  spew("stxvx\t%3s,%3s,%3s", xs.name(), ra.name(), rb.name());
++  return writeInst(XX1Form(PPC_stxvx, xs.encoding(), ra.code(), rb.code()));
++}
++
++BufferOffset Assembler::as_lxvd2x(FloatRegister xt, Register ra, Register rb) {
++  spew("lxvd2x\t%3s,%3s,%3s", xt.name(), ra.name(), rb.name());
++  return writeInst(XX1Form(PPC_lxvd2x, xt.encoding(), ra.code(), rb.code()));
++}
++
++BufferOffset Assembler::as_stxvd2x(FloatRegister xs, Register ra, Register rb) {
++  spew("stxvd2x\t%3s,%3s,%3s", xs.name(), ra.name(), rb.name());
++  return writeInst(XX1Form(PPC_stxvd2x, xs.encoding(), ra.code(), rb.code()));
++}
++
++// VMX register load/store. See PPC_lvx/PPC_stvx in Assembler-ppc64.h for
++// the encoding rationale.
++BufferOffset Assembler::as_lvx(uint8_t vrt, Register ra, Register rb) {
++  MOZ_ASSERT(vrt < 32);
++  spew("lvx\tvr%d,%3s,%3s", vrt, ra.name(), rb.name());
++  return writeInst(PPC_lvx | uint32_t(vrt) << 21 | ra.code() << 16 |
++                   rb.code() << 11);
++}
++
++BufferOffset Assembler::as_stvx(uint8_t vrs, Register ra, Register rb) {
++  MOZ_ASSERT(vrs < 32);
++  spew("stvx\tvr%d,%3s,%3s", vrs, ra.name(), rb.name());
++  return writeInst(PPC_stvx | uint32_t(vrs) << 21 | ra.code() << 16 |
++                   rb.code() << 11);
++}
++
++// --- VSX SIMD register operations ---
++
++// XX3-form: opcode | T[0:4]<<21 | A[0:4]<<16 | B[0:4]<<11 | xo | AX | BX | TX
++// where AX/BX/TX (bits 2/1/0) carry bit 5 of each 6-bit VSR index.
++// Encoded by the XX3Form helper above for both VSR0-31 (Single/Double) and
++// VSR32-63 (Simd128) operands.
++BufferOffset Assembler::as_xxlor(FloatRegister xt, FloatRegister xa,
++                                 FloatRegister xb) {
++  spew("xxlor\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
++  return writeInst(XX3Form(PPC_xxlor, xt, xa, xb));
++}
++
++BufferOffset Assembler::as_xxland(FloatRegister xt, FloatRegister xa,
++                                  FloatRegister xb) {
++  spew("xxland\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
++  return writeInst(XX3Form(PPC_xxland, xt, xa, xb));
++}
++
++BufferOffset Assembler::as_xxlxor(FloatRegister xt, FloatRegister xa,
++                                  FloatRegister xb) {
++  spew("xxlxor\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
++  return writeInst(XX3Form(PPC_xxlxor, xt, xa, xb));
++}
++
++BufferOffset Assembler::as_xxlnor(FloatRegister xt, FloatRegister xa,
++                                  FloatRegister xb) {
++  spew("xxlnor\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
++  return writeInst(XX3Form(PPC_xxlnor, xt, xa, xb));
++}
++
++BufferOffset Assembler::as_xxlandc(FloatRegister xt, FloatRegister xa,
++                                   FloatRegister xb) {
++  spew("xxlandc\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
++  return writeInst(XX3Form(PPC_xxlandc, xt, xa, xb));
++}
++
++BufferOffset Assembler::as_xxsel(FloatRegister xt, FloatRegister xa,
++                                 FloatRegister xb, FloatRegister xc) {
++  spew("xxsel\t%3s,%3s,%3s,%3s", xt.name(), xa.name(), xb.name(), xc.name());
++  return writeInst(XX4Form(PPC_xxsel, xt.encoding(), xa.encoding(),
++                           xb.encoding(), xc.encoding()));
++}
++
++BufferOffset Assembler::as_xxpermdi(FloatRegister xt, FloatRegister xa,
++                                    FloatRegister xb, uint8_t dm) {
++  MOZ_ASSERT(dm < 4);
++  spew("xxpermdi\t%3s,%3s,%3s,%d", xt.name(), xa.name(), xb.name(), dm);
++  return writeInst(XX3Form(PPC_xxpermdi | (uint32_t(dm) << 8), xt, xa, xb));
++}
++
++// POWER9 (ISA 3.0). XX1-form with two GPR sources.
++BufferOffset Assembler::as_mtvsrdd(FloatRegister xt, Register ra, Register rb) {
++  spew("mtvsrdd\t%3s,%3s,%3s", xt.name(), ra.name(), rb.name());
++  return writeInst(XX1Form(PPC_mtvsrdd, xt.encoding(), ra.code(), rb.code()));
++}
++
++// POWER9 (ISA 3.0). XX1-form: move lower doubleword of VSR to GPR.
++BufferOffset Assembler::as_mfvsrld(Register rt, FloatRegister xs) {
++  spew("mfvsrld\t%3s,%3s", rt.name(), xs.name());
++  return writeInst(XX1FormMfvsr(PPC_mfvsrld, rt.code(), xs.encoding()));
++}
++
++// --- XX2-form VSX instructions ---
++
++// XX2-form: opcode | T<<21 | UIM<<16_area | B<<11_area | XO<<2 | BX | TX
++// For VSR0-31, BX=TX=0.
++
++BufferOffset Assembler::as_xxspltw(FloatRegister xt, FloatRegister xb,
++                                   uint8_t uim) {
++  MOZ_ASSERT(uim < 4);
++  spew("xxspltw\t%3s,%3s,%d", xt.name(), xb.name(), uim);
++  return writeInst(XX2Form(PPC_xxspltw, xt.encoding(), xb.encoding(), uim));
++}
++
++BufferOffset Assembler::as_xxinsertw(FloatRegister xt, FloatRegister xb,
++                                     uint8_t uim) {
++  MOZ_ASSERT(uim <= 12 && (uim & 3) == 0);
++  spew("xxinsertw\t%3s,%3s,%d", xt.name(), xb.name(), uim);
++  return writeInst(XX2Form(PPC_xxinsertw, xt.encoding(), xb.encoding(), uim));
++}
++
++BufferOffset Assembler::as_xxextractuw(FloatRegister xt, FloatRegister xb,
++                                       uint8_t uim) {
++  MOZ_ASSERT(uim <= 12 && (uim & 3) == 0);
++  spew("xxextractuw\t%3s,%3s,%d", xt.name(), xb.name(), uim);
++  return writeInst(XX2Form(PPC_xxextractuw, xt.encoding(), xb.encoding(), uim));
++}
++
++// POWER9 (ISA 3.0). XX1-form-ish: T(5) + UIM8(8) + XO + TX. UIM8 occupies
++// bits 18..11 (a non-standard slot that XX1Form doesn't fit), so encode
++// inline. TX bit at instruction bit 0 selects the upper half of VSR
++// space when xt.encoding() is in 32-63 (Simd128).
++BufferOffset Assembler::as_xxspltib(FloatRegister xt, uint8_t imm8) {
++  spew("xxspltib\t%3s,%u", xt.name(), imm8);
++  uint32_t enc = uint32_t(xt.encoding());
++  return writeInst(PPC_xxspltib | (enc & 31) << 21 | (uint32_t)imm8 << 11 |
++                   ((enc >> 5) & 1));
++}
++
++// --- VMX instructions ---
++
++// VX-form: (4<<26) | VRT<<21 | UIMM<<16 | VRB<<11 | XO
++// VRT/VRB are 5-bit raw VR numbers (0-31). Simd128 FloatRegister.encoding()
++// returns 32-63; masking with & 31 maps it back to the VR offset 0-31.
++BufferOffset Assembler::as_vspltb(FloatRegister vrt, FloatRegister vrb,
++                                  uint8_t uim) {
++  MOZ_ASSERT(uim < 16);
++  spew("vspltb\t%3s,%3s,%d", vrt.name(), vrb.name(), uim);
++  return writeInst(PPC_vspltb | (vrt.encoding() & 31) << 21 |
++                   (uint32_t)uim << 16 | (vrb.encoding() & 31) << 11);
++}
++
++BufferOffset Assembler::as_vsplth(FloatRegister vrt, FloatRegister vrb,
++                                  uint8_t uim) {
++  MOZ_ASSERT(uim < 8);
++  spew("vsplth\t%3s,%3s,%d", vrt.name(), vrb.name(), uim);
++  return writeInst(PPC_vsplth | (vrt.encoding() & 31) << 21 |
++                   (uint32_t)uim << 16 | (vrb.encoding() & 31) << 11);
++}
++
++// VA-form: (4<<26) | VRT<<21 | VRA<<16 | VRB<<11 | SHB<<6 | XO(6-bit)
++BufferOffset Assembler::as_vsldoi(FloatRegister vrt, FloatRegister vra,
++                                  FloatRegister vrb, uint8_t shb) {
++  MOZ_ASSERT(shb < 16);
++  spew("vsldoi\t%3s,%3s,%3s,%d", vrt.name(), vra.name(), vrb.name(), shb);
++  return writeInst(PPC_vsldoi | (vrt.encoding() & 31) << 21 |
++                   (vra.encoding() & 31) << 16 | (vrb.encoding() & 31) << 11 |
++                   (uint32_t)shb << 6);
++}
++
++// --- VMX integer arithmetic (VR registers only) ---
++
++// VX-form: (4<<26) | VRT<<21 | VRA<<16 | VRB<<11 | XO
++// The macro takes raw VR numbers (0-31).
++#define DEF_VMX_VVV(op)                                                    \
++  BufferOffset Assembler::as_##op(uint8_t vrt, uint8_t vra, uint8_t vrb) { \
++    MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32);                          \
++    spew(#op "\tvr%d,vr%d,vr%d", vrt, vra, vrb);                           \
++    return writeInst(PPC_##op | vrt << 21 | vra << 16 | vrb << 11);        \
++  }
++
++DEF_VMX_VVV(vaddubm)
++DEF_VMX_VVV(vadduhm)
++DEF_VMX_VVV(vadduwm)
++DEF_VMX_VVV(vaddudm)
++DEF_VMX_VVV(vsububm)
++DEF_VMX_VVV(vsubuhm)
++DEF_VMX_VVV(vsubuwm)
++DEF_VMX_VVV(vsubudm)
++DEF_VMX_VVV(vaddsbs)
++DEF_VMX_VVV(vaddshs)
++DEF_VMX_VVV(vaddubs)
++DEF_VMX_VVV(vadduhs)
++DEF_VMX_VVV(vsubsbs)
++DEF_VMX_VVV(vsubshs)
++DEF_VMX_VVV(vsububs)
++DEF_VMX_VVV(vsubuhs)
++DEF_VMX_VVV(vminsb)
++DEF_VMX_VVV(vminsh)
++DEF_VMX_VVV(vminsw)
++DEF_VMX_VVV(vmaxsb)
++DEF_VMX_VVV(vmaxsh)
++DEF_VMX_VVV(vmaxsw)
++DEF_VMX_VVV(vmaxsd)
++DEF_VMX_VVV(vminub)
++DEF_VMX_VVV(vminuh)
++DEF_VMX_VVV(vminuw)
++DEF_VMX_VVV(vmaxub)
++DEF_VMX_VVV(vmaxuh)
++DEF_VMX_VVV(vmaxuw)
++DEF_VMX_VVV(vavgub)
++DEF_VMX_VVV(vavguh)
++DEF_VMX_VVV(vmuluwm)
++DEF_VMX_VVV(vmulld)
++
++DEF_VMX_VVV(vslb)
++DEF_VMX_VVV(vslh)
++DEF_VMX_VVV(vslw)
++DEF_VMX_VVV(vsld)
++DEF_VMX_VVV(vsrb)
++DEF_VMX_VVV(vsrh)
++DEF_VMX_VVV(vsrw)
++DEF_VMX_VVV(vsrd)
++DEF_VMX_VVV(vsrab)
++DEF_VMX_VVV(vsrah)
++DEF_VMX_VVV(vsraw)
++DEF_VMX_VVV(vsrad)
++DEF_VMX_VVV(vslo)
++DEF_VMX_VVV(vsro)
++DEF_VMX_VVV(vcmpequb)
++DEF_VMX_VVV(vcmpequh)
++DEF_VMX_VVV(vcmpequw)
++DEF_VMX_VVV(vcmpequd)
++DEF_VMX_VVV(vcmpgtsb)
++DEF_VMX_VVV(vcmpgtsh)
++DEF_VMX_VVV(vcmpgtsw)
++DEF_VMX_VVV(vcmpgtsd)
++DEF_VMX_VVV(vcmpgtub)
++DEF_VMX_VVV(vcmpgtuh)
++DEF_VMX_VVV(vcmpgtuw)
++DEF_VMX_VVV(vcmpgtud)
++// POWER9 (ISA 3.0). NotEqual compare; saves the xxlnor that vcmpequX needs.
++DEF_VMX_VVV(vcmpneb)
++DEF_VMX_VVV(vcmpneh)
++DEF_VMX_VVV(vcmpnew)
++
++// POWER8+ (ISA 2.07). vbpermq RT,RA,RB: bit-permute quadword.
++DEF_VMX_VVV(vbpermq)
++
++#undef DEF_VMX_VVV
++
++// VC-form record forms: same as VX-form above with Rc bit (bit 10 LSB) set.
++// vcmpXXX. sets CR6: LT = all-true, EQ = none-true.
++#define DEF_VMX_VVV_RC(op)                                                  \
++  BufferOffset Assembler::as_##op##_rc(uint8_t vrt, uint8_t vra,            \
++                                       uint8_t vrb) {                       \
++    MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32);                           \
++    spew(#op ".\tvr%d,vr%d,vr%d", vrt, vra, vrb);                           \
++    return writeInst(PPC_##op | vrt << 21 | vra << 16 | vrb << 11 | 0x400); \
++  }
++
++DEF_VMX_VVV_RC(vcmpequb)
++DEF_VMX_VVV_RC(vcmpequh)
++DEF_VMX_VVV_RC(vcmpequw)
++DEF_VMX_VVV_RC(vcmpequd)
++
++#undef DEF_VMX_VVV_RC
++
++// VSX float compare (XX3-form).
++#define DEF_VSX_CMP(op)                                               \
++  BufferOffset Assembler::as_##op(FloatRegister xt, FloatRegister xa, \
++                                  FloatRegister xb) {                 \
++    spew(#op "\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());       \
++    return writeInst(XX3Form(PPC_##op, xt, xa, xb));                  \
++  }
++
++DEF_VSX_CMP(xvcmpeqsp)
++DEF_VSX_CMP(xvcmpgtsp)
++DEF_VSX_CMP(xvcmpgesp)
++DEF_VSX_CMP(xvcmpeqdp)
++DEF_VSX_CMP(xvcmpgtdp)
++DEF_VSX_CMP(xvcmpgedp)
++
++#undef DEF_VSX_CMP
++
++// VSX float arithmetic (XX3-form binary).
++#define DEF_VSX_BIN(op)                                               \
++  BufferOffset Assembler::as_##op(FloatRegister xt, FloatRegister xa, \
++                                  FloatRegister xb) {                 \
++    spew(#op "\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());       \
++    return writeInst(XX3Form(PPC_##op, xt, xa, xb));                  \
++  }
++DEF_VSX_BIN(xvaddsp)
++DEF_VSX_BIN(xvadddp) DEF_VSX_BIN(xvsubsp) DEF_VSX_BIN(xvsubdp) DEF_VSX_BIN(
++    xvmulsp) DEF_VSX_BIN(xvmuldp) DEF_VSX_BIN(xvdivsp) DEF_VSX_BIN(xvdivdp)
++    DEF_VSX_BIN(xvminsp) DEF_VSX_BIN(xvmindp) DEF_VSX_BIN(xvmaxsp) DEF_VSX_BIN(
++        xvmaxdp) DEF_VSX_BIN(xvmaddasp) DEF_VSX_BIN(xvmaddadp)
++        DEF_VSX_BIN(xvnmsubasp) DEF_VSX_BIN(xvnmsubadp)
++#undef DEF_VSX_BIN
++
++// VSX unary (XX2-form): op | xt<<21 | xb<<11 | XO<<2
++// XX2-form unary VSX op: T + B, no UIM. Uses XX2Form helper for TX/BX bits.
++#define DEF_VSX_UN(op)                                                  \
++  BufferOffset Assembler::as_##op(FloatRegister xt, FloatRegister xb) { \
++    spew(#op "\t%3s,%3s", xt.name(), xb.name());                        \
++    return writeInst(XX2Form(PPC_##op, xt.encoding(), xb.encoding()));  \
++  }
++            DEF_VSX_UN(xvabssp) DEF_VSX_UN(xvabsdp) DEF_VSX_UN(
++                xvnegsp) DEF_VSX_UN(xvnegdp) DEF_VSX_UN(xvsqrtsp)
++                DEF_VSX_UN(xvsqrtdp) DEF_VSX_UN(xvrspip) DEF_VSX_UN(
++                    xvrdpip) DEF_VSX_UN(xvrspim) DEF_VSX_UN(xvrdpim)
++                    DEF_VSX_UN(xvrspiz) DEF_VSX_UN(xvrdpiz) DEF_VSX_UN(
++                        xvrspic) DEF_VSX_UN(xvrdpic) DEF_VSX_UN(xvcvsxwsp)
++                        DEF_VSX_UN(xvcvuxwsp) DEF_VSX_UN(xvcvsxwdp) DEF_VSX_UN(
++                            xvcvuxwdp) DEF_VSX_UN(xvcvspsxws)
++                            DEF_VSX_UN(xvcvspuxws) DEF_VSX_UN(xvcvdpsxws)
++                                DEF_VSX_UN(xvcvdpuxws) DEF_VSX_UN(xvcvdpsp)
++                                    DEF_VSX_UN(xvcvspdp)
++#undef DEF_VSX_UN
++
++// VMX unary VX-form: (4<<26) | VRT<<21 | 0<<16 | VRB<<11 | XO
++#define DEF_VMX_UNARY(op)                                     \
++  BufferOffset Assembler::as_##op(uint8_t vrt, uint8_t vrb) { \
++    MOZ_ASSERT(vrt < 32 && vrb < 32);                         \
++    spew(#op "\tvr%d,vr%d", vrt, vrb);                        \
++    return writeInst(PPC_##op | vrt << 21 | vrb << 11);       \
++  }
++                                        DEF_VMX_UNARY(vupkhsb) DEF_VMX_UNARY(
++                                            vupklsb) DEF_VMX_UNARY(vupkhsh)
++                                            DEF_VMX_UNARY(vupklsh)
++                                                DEF_VMX_UNARY(vupkhsw)
++                                                    DEF_VMX_UNARY(vupklsw)
++    // POWER9 per-lane integer negate. The VRA field holds the subop code
++    // (6 for vnegw, 7 for vnegd) which is already baked into PPC_vneg{w,d}.
++    DEF_VMX_UNARY(vnegw) DEF_VMX_UNARY(vnegd) DEF_VMX_UNARY(vpopcntb)
++#undef DEF_VMX_UNARY
++
++    // POWER9 addpcis (DX-form). Computes rT = (CIA + 4) + (D << 16).
++    // D is a 16-bit signed immediate, split across three instruction fields:
++    //   d0 = bits 16..25 (10 bits, D[15:6])
++    //   d1 = bits 11..15 (5 bits,  D[5:1])
++    //   d2 = bit 31      (1 bit,   D[0])
++    // Primary opcode 19, DX subop 2.
++    BufferOffset Assembler::as_addpcis(Register rt, int16_t d) {
++  spew("addpcis\t%s,%d", rt.name(), (int)d);
++  uint32_t D = uint16_t(d);
++  uint32_t inst = (19u << 26) | (uint32_t(rt.code()) << 21) |
++                  ((D >> 1) & 0x1F) << 16 | ((D >> 6) & 0x3FF) << 6 |
++                  (2u << 1) | (D & 1u);
++  return writeInst(inst);
++}
++
++// -----------------------------------------------------------------------------
++// Power ISA v3.1 (POWER10) prefixed instructions.
++//
++// Layout:
++//
++//   Prefix word (BE bit numbering from the manual; LE bits in parentheses):
++//     [0..5]   primary opcode = 1   (LE 31..26)
++//     [6..7]   Type: 00 = 8LS, 10 = MLS   (LE 25..24)
++//     [8..10]  reserved = 0   (LE 23..21)
++//     [11]     R: 1 = PC-relative (RA must be r0)   (LE 20)
++//     [12..13] reserved = 0   (LE 19..18)
++//     [14..31] d0: high 18 bits of 34-bit signed immediate   (LE 17..0)
++//
++//   Suffix (paddi/pld, GPR target):
++//     [0..5]   suffix opcode (paddi=14, pld=57)   (LE 31..26)
++//     [6..10]  RT   (LE 25..21)
++//     [11..15] RA   (LE 20..16)
++//     [16..31] d1: low 16 bits of immediate   (LE 15..0)
++//
++//   Suffix (plxv, VSR target — has the TX bit at suffix bit 5/LE bit 26):
++//     [0..4]   plxv 5-bit opcode = 11001 (=25)   (LE 31..27)
++//     [5]      TX (high bit of 6-bit XT)   (LE 26)
++//     [6..10]  T  (low 5 bits of XT)   (LE 25..21)
++//     [11..15] RA   (LE 20..16)
++//     [16..31] d1   (LE 15..0)
++//
++// The prefix and suffix of a prefixed instruction must lie in the same
++// 64-byte aligned block at **runtime**. The JitCode allocator only
++// guarantees 16-byte alignment, so the buffer-relative offset and the
++// runtime address can differ by 0/16/32/48 mod 64. A buffer-only check
++// `(currentOffset() & 63) == 60` is correct when the allocator base is
++// 64-aligned but misses three of the four 16-aligned base classes — pad
++// whenever `(currentOffset() & 15) == 12`, which catches all four. The
++// enterNoPool guard prevents the constant-pool flusher from inserting
++// bodies between the (optional) nop, prefix, and suffix.
++
++static uint32_t EncodePower10Prefix(uint32_t type, bool R, uint32_t d0) {
++  MOZ_ASSERT(type == 0 || type == 2);  // 8LS=0, MLS=2
++  MOZ_ASSERT(d0 < (1u << 18));
++  return (1u << 26) | (type << 24) | (uint32_t(R ? 1 : 0) << 20) |
++         (d0 & 0x3FFFFu);
++}
++
++static void SplitImm34(int64_t imm34, uint32_t* d0, uint32_t* d1) {
++  MOZ_ASSERT(imm34 >= -(int64_t(1) << 33));
++  MOZ_ASSERT(imm34 < (int64_t(1) << 33));
++  uint64_t u = uint64_t(imm34) & 0x3FFFFFFFFull;  // low 34 bits
++  *d0 = uint32_t(u >> 16) & 0x3FFFFu;             // 18 bits
++  *d1 = uint32_t(u) & 0xFFFFu;                    // 16 bits
++}
++
++void Assembler::ensurePrefixedAlignment() {
++  if ((currentOffset() & 15) == 12) {
++    as_nop();
++  }
++}
++
++// paddi RT, RA, SI, R   (MLS, suffix opcode 14 = addi)
++//   R=0: RT = (RA==0 ? 0 : RA) + sign_extend(SI, 34)
++//   R=1: RT = CIA(prefix) + sign_extend(SI, 34)   (RA must be r0)
++BufferOffset Assembler::as_paddi(Register rt, Register ra, int64_t imm34,
++                                  bool R) {
++  MOZ_ASSERT_IF(R, ra == r0);
++  spew("paddi\t%s,%s,%lld,%d", rt.name(), ra.name(), (long long)imm34,
++       R ? 1 : 0);
++  uint32_t d0, d1;
++  SplitImm34(imm34, &d0, &d1);
++  uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
++  uint32_t suffix = (14u << 26) | (uint32_t(rt.code()) << 21) |
++                    (uint32_t(ra.code()) << 16) | d1;
++  // Reservation = nop (worst case) + prefix + suffix.
++  m_buffer.enterNoPool(3);
++  ensurePrefixedAlignment();
++  BufferOffset bo = writeInst(prefix);
++  writeInst(suffix);
++  m_buffer.leaveNoPool();
++  return bo;
++}
++
++// pld RT, D(RA), R   (8LS, suffix opcode 57)
++BufferOffset Assembler::as_pld(Register rt, Register ra, int64_t imm34,
++                                bool R) {
++  MOZ_ASSERT_IF(R, ra == r0);
++  spew("pld\t%s,%lld(%s),%d", rt.name(), (long long)imm34, ra.name(),
++       R ? 1 : 0);
++  uint32_t d0, d1;
++  SplitImm34(imm34, &d0, &d1);
++  uint32_t prefix = EncodePower10Prefix(/*type=8LS*/ 0, R, d0);
++  uint32_t suffix = (57u << 26) | (uint32_t(rt.code()) << 21) |
++                    (uint32_t(ra.code()) << 16) | d1;
++  m_buffer.enterNoPool(3);
++  ensurePrefixedAlignment();
++  BufferOffset bo = writeInst(prefix);
++  writeInst(suffix);
++  m_buffer.leaveNoPool();
++  return bo;
++}
++
++// plxv XT, D(RA), R   (8LS, 5-bit suffix opcode 25, TX in suffix bit 26)
++//   XT is 6-bit: TX (high) || T (low 5) — matches lxvx convention.
++BufferOffset Assembler::as_plxv(uint8_t xt, Register ra, int64_t imm34,
++                                 bool R) {
++  MOZ_ASSERT(xt < 64);
++  MOZ_ASSERT_IF(R, ra == r0);
++  spew("plxv\tvs%u,%lld(%s),%d", xt, (long long)imm34, ra.name(),
++       R ? 1 : 0);
++  uint32_t d0, d1;
++  SplitImm34(imm34, &d0, &d1);
++  uint32_t prefix = EncodePower10Prefix(/*type=8LS*/ 0, R, d0);
++  uint32_t T = xt & 0x1Fu;
++  uint32_t TX = (xt >> 5) & 1u;
++  uint32_t suffix = (25u << 27) | (TX << 26) | (T << 21) |
++                    (uint32_t(ra.code()) << 16) | d1;
++  m_buffer.enterNoPool(3);
++  ensurePrefixedAlignment();
++  BufferOffset bo = writeInst(prefix);
++  writeInst(suffix);
++  m_buffer.leaveNoPool();
++  return bo;
++}
++
++// plfd FRT, D(RA), R   (MLS, suffix opcode 50; D-form-like FPR load)
++BufferOffset Assembler::as_plfd(FloatRegister frt, Register ra, int64_t imm34,
++                                 bool R) {
++  MOZ_ASSERT_IF(R, ra == r0);
++  spew("plfd\tf%u,%lld(%s),%d", uint32_t(frt.encoding()),
++       (long long)imm34, ra.name(), R ? 1 : 0);
++  uint32_t d0, d1;
++  SplitImm34(imm34, &d0, &d1);
++  uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
++  uint32_t suffix = (50u << 26) | (uint32_t(frt.encoding()) << 21) |
++                    (uint32_t(ra.code()) << 16) | d1;
++  m_buffer.enterNoPool(3);
++  ensurePrefixedAlignment();
++  BufferOffset bo = writeInst(prefix);
++  writeInst(suffix);
++  m_buffer.leaveNoPool();
++  return bo;
++}
++
++// plfs FRT, D(RA), R   (MLS, suffix opcode 48; widens single → double in FPR)
++BufferOffset Assembler::as_plfs(FloatRegister frt, Register ra, int64_t imm34,
++                                 bool R) {
++  MOZ_ASSERT_IF(R, ra == r0);
++  spew("plfs\tf%u,%lld(%s),%d", uint32_t(frt.encoding()),
++       (long long)imm34, ra.name(), R ? 1 : 0);
++  uint32_t d0, d1;
++  SplitImm34(imm34, &d0, &d1);
++  uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
++  uint32_t suffix = (48u << 26) | (uint32_t(frt.encoding()) << 21) |
++                    (uint32_t(ra.code()) << 16) | d1;
++  m_buffer.enterNoPool(3);
++  ensurePrefixedAlignment();
++  BufferOffset bo = writeInst(prefix);
++  writeInst(suffix);
++  m_buffer.leaveNoPool();
++  return bo;
++}
++
++// pstd RS, D(RA), R   (8LS, suffix opcode 61 = std D-form)
++BufferOffset Assembler::as_pstd(Register rs, Register ra, int64_t imm34,
++                                 bool R) {
++  MOZ_ASSERT_IF(R, ra == r0);
++  spew("pstd\t%s,%lld(%s),%d", rs.name(), (long long)imm34, ra.name(),
++       R ? 1 : 0);
++  uint32_t d0, d1;
++  SplitImm34(imm34, &d0, &d1);
++  uint32_t prefix = EncodePower10Prefix(/*type=8LS*/ 0, R, d0);
++  uint32_t suffix = (61u << 26) | (uint32_t(rs.code()) << 21) |
++                    (uint32_t(ra.code()) << 16) | d1;
++  m_buffer.enterNoPool(3);
++  ensurePrefixedAlignment();
++  BufferOffset bo = writeInst(prefix);
++  writeInst(suffix);
++  m_buffer.leaveNoPool();
++  return bo;
++}
++
++// pstxv XS, D(RA), R   (8LS, 5-bit suffix opcode 27, SX in suffix bit 26)
++//   XS is 6-bit: SX (high) || S (low 5) — matches stxvx convention.
++BufferOffset Assembler::as_pstxv(uint8_t xs, Register ra, int64_t imm34,
++                                  bool R) {
++  MOZ_ASSERT(xs < 64);
++  MOZ_ASSERT_IF(R, ra == r0);
++  spew("pstxv\tvs%u,%lld(%s),%d", xs, (long long)imm34, ra.name(),
++       R ? 1 : 0);
++  uint32_t d0, d1;
++  SplitImm34(imm34, &d0, &d1);
++  uint32_t prefix = EncodePower10Prefix(/*type=8LS*/ 0, R, d0);
++  uint32_t sx = (xs >> 5) & 1;
++  uint32_t s = xs & 0x1F;
++  uint32_t suffix = (27u << 27) | (sx << 26) | (s << 21) |
++                    (uint32_t(ra.code()) << 16) | d1;
++  m_buffer.enterNoPool(3);
++  ensurePrefixedAlignment();
++  BufferOffset bo = writeInst(prefix);
++  writeInst(suffix);
++  m_buffer.leaveNoPool();
++  return bo;
++}
++
++// pstfd FRS, D(RA), R   (MLS, suffix opcode 54 = stfd)
++BufferOffset Assembler::as_pstfd(FloatRegister frs, Register ra, int64_t imm34,
++                                  bool R) {
++  MOZ_ASSERT_IF(R, ra == r0);
++  spew("pstfd\tf%u,%lld(%s),%d", uint32_t(frs.encoding()),
++       (long long)imm34, ra.name(), R ? 1 : 0);
++  uint32_t d0, d1;
++  SplitImm34(imm34, &d0, &d1);
++  uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
++  uint32_t suffix = (54u << 26) | (uint32_t(frs.encoding()) << 21) |
++                    (uint32_t(ra.code()) << 16) | d1;
++  m_buffer.enterNoPool(3);
++  ensurePrefixedAlignment();
++  BufferOffset bo = writeInst(prefix);
++  writeInst(suffix);
++  m_buffer.leaveNoPool();
++  return bo;
++}
++
++// pstfs FRS, D(RA), R   (MLS, suffix opcode 52 = stfs)
++BufferOffset Assembler::as_pstfs(FloatRegister frs, Register ra, int64_t imm34,
++                                  bool R) {
++  MOZ_ASSERT_IF(R, ra == r0);
++  spew("pstfs\tf%u,%lld(%s),%d", uint32_t(frs.encoding()),
++       (long long)imm34, ra.name(), R ? 1 : 0);
++  uint32_t d0, d1;
++  SplitImm34(imm34, &d0, &d1);
++  uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
++  uint32_t suffix = (52u << 26) | (uint32_t(frs.encoding()) << 21) |
++                    (uint32_t(ra.code()) << 16) | d1;
++  m_buffer.enterNoPool(3);
++  ensurePrefixedAlignment();
++  BufferOffset bo = writeInst(prefix);
++  writeInst(suffix);
++  m_buffer.leaveNoPool();
++  return bo;
++}
++
++// POWER10 (ISA 3.1) Vector Extract Mask. RT (GPR) gets the wasm-spec
++// bitmask (one bit per lane MSB) directly in low 16/8/4/2 bits. UIM
++// is baked into PPC_vextract{b,h,w,d}m (8/9/10/11). Caller must have
++// verified HasPOWER10().
++#define DEF_VEXTRACT_M(op)                                                 \
++  BufferOffset Assembler::as_##op(Register rt, FloatRegister vrb) {        \
++    spew(#op "\t%s,vr%u", rt.name(), uint32_t(vrb.encoding() & 31));       \
++    return writeInst(PPC_##op | (uint32_t(rt.code()) << 21) |              \
++                     ((uint32_t(vrb.encoding()) & 31) << 11));             \
++  }
++DEF_VEXTRACT_M(vextractbm)
++DEF_VEXTRACT_M(vextracthm)
++DEF_VEXTRACT_M(vextractwm)
++DEF_VEXTRACT_M(vextractdm)
++#undef DEF_VEXTRACT_M
++
++// POWER10 (ISA 3.1) Vector Insert Word/Doubleword from GPR. VX-form:
++// VRT at bits 21..25, UIM at bits 16..20, RB at bits 11..15.
++#define DEF_VINS(op, max_uim)                                              \
++  BufferOffset Assembler::as_##op(FloatRegister vrt, Register rb,          \
++                                  uint8_t uim) {                           \
++    MOZ_ASSERT(uim <= (max_uim));                                          \
++    spew(#op "\tvr%u,%s,%u", uint32_t(vrt.encoding() & 31), rb.name(),     \
++         uint32_t(uim));                                                   \
++    return writeInst(PPC_##op |                                            \
++                     ((uint32_t(vrt.encoding()) & 31) << 21) |             \
++                     (uint32_t(uim) << 16) |                               \
++                     (uint32_t(rb.code()) << 11));                         \
++  }
++DEF_VINS(vinsw, 12)
++DEF_VINS(vinsd, 8)
++#undef DEF_VINS
++
++// POWER10 (ISA 3.1) Vector Insert byte/halfword from GPR with
++// register-supplied byte position. VX-form: VRT at bits 21..25,
++// RA at bits 16..20, RB at bits 11..15. "rx" is right-indexed
++// (LE-natural — index 0 = LSB byte).
++#define DEF_VINS_RX(op)                                                    \
++  BufferOffset Assembler::as_##op(FloatRegister vrt, Register ra,          \
++                                  Register rb) {                           \
++    spew(#op "\tvr%u,%s,%s", uint32_t(vrt.encoding() & 31), ra.name(),     \
++         rb.name());                                                       \
++    return writeInst(PPC_##op |                                            \
++                     ((uint32_t(vrt.encoding()) & 31) << 21) |             \
++                     (uint32_t(ra.code()) << 16) |                         \
++                     (uint32_t(rb.code()) << 11));                         \
++  }
++DEF_VINS_RX(vinsbrx)
++DEF_VINS_RX(vinshrx)
++#undef DEF_VINS_RX
++
++// POWER9 (ISA 3.0) V-form 3-operand instructions with VRT, UIM, VRB at
++// bits 21..25, 16..20, 11..15 respectively (vinsert{b,h}, vextract{ub,uh}).
++// Simd128 lives in VSR32-63 (= VR0-31), so we mask VRT and VRB to the
++// 5-bit VR field via `encoding() & 31`.
++#define DEF_VRT_UIM_VRB(op, max_uim, uim_step)                              \
++  BufferOffset Assembler::as_##op(FloatRegister vrt, FloatRegister vrb,    \
++                                  uint8_t uim) {                           \
++    MOZ_ASSERT(uim <= (max_uim));                                          \
++    MOZ_ASSERT((uim) % (uim_step) == 0);                                   \
++    spew(#op "\tvr%u,vr%u,%u", uint32_t(vrt.encoding() & 31),              \
++         uint32_t(vrb.encoding() & 31), uint32_t(uim));                    \
++    return writeInst(PPC_##op |                                            \
++                     ((uint32_t(vrt.encoding()) & 31) << 21) |             \
++                     (uint32_t(uim) << 16) |                               \
++                     ((uint32_t(vrb.encoding()) & 31) << 11));             \
++  }
++DEF_VRT_UIM_VRB(vinsertb, 15, 1)
++DEF_VRT_UIM_VRB(vinserth, 14, 2)
++DEF_VRT_UIM_VRB(vextractub, 15, 1)
++DEF_VRT_UIM_VRB(vextractuh, 14, 2)
++#undef DEF_VRT_UIM_VRB
++
++// VMX binary VX-form pack/merge (re-use DEF_VMX_VVV pattern).
++#define DEF_VMX_VVV(op)                                                    \
++  BufferOffset Assembler::as_##op(uint8_t vrt, uint8_t vra, uint8_t vrb) { \
++    MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32);                          \
++    spew(#op "\tvr%d,vr%d,vr%d", vrt, vra, vrb);                           \
++    return writeInst(PPC_##op | vrt << 21 | vra << 16 | vrb << 11);        \
++  }
++DEF_VMX_VVV(vpkshss)
++DEF_VMX_VVV(vpkswss) DEF_VMX_VVV(vpkshus) DEF_VMX_VVV(vpkswus)
++    DEF_VMX_VVV(vmrghb)
++        DEF_VMX_VVV(vmrghh) DEF_VMX_VVV(vmrghw) DEF_VMX_VVV(vmrglb)
++            DEF_VMX_VVV(vmrglh) DEF_VMX_VVV(vmrglw) DEF_VMX_VVV(vmulesb)
++                DEF_VMX_VVV(vmulosb) DEF_VMX_VVV(vmuleub) DEF_VMX_VVV(vmuloub)
++                    DEF_VMX_VVV(vmulesh) DEF_VMX_VVV(vmulosh)
++                        DEF_VMX_VVV(vmuleuh) DEF_VMX_VVV(vmulouh)
++                            DEF_VMX_VVV(vmulesw) DEF_VMX_VVV(vmulosw)
++                                DEF_VMX_VVV(vmuleuw) DEF_VMX_VVV(vmulouw)
++#undef DEF_VMX_VVV
++
++    // vperm VA-form: (4<<26) | VRT<<21 | VRA<<16 | VRB<<11 | VRC<<6 | XO
++    BufferOffset Assembler::as_vperm(uint8_t vrt, uint8_t vra, uint8_t vrb,
++                                     uint8_t vrc) {
++  MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
++  spew("vperm\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
++  return writeInst(PPC_vperm | vrt << 21 | vra << 16 | vrb << 11 | vrc << 6);
++}
++
++// VA-form ternary VMX: (4<<26) | VRT<<21 | VRA<<16 | VRB<<11 | VRC<<6 |
++// XO(6-bit)
++BufferOffset Assembler::as_vmladduhm(uint8_t vrt, uint8_t vra, uint8_t vrb,
++                                     uint8_t vrc) {
++  MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
++  spew("vmladduhm\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
++  return writeInst(PPC_vmladduhm | vrt << 21 | vra << 16 | vrb << 11 |
++                   vrc << 6);
++}
++
++BufferOffset Assembler::as_vmhraddshs(uint8_t vrt, uint8_t vra, uint8_t vrb,
++                                      uint8_t vrc) {
++  MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
++  spew("vmhraddshs\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
++  return writeInst(PPC_vmhraddshs | vrt << 21 | vra << 16 | vrb << 11 |
++                   vrc << 6);
++}
++
++BufferOffset Assembler::as_vmsumshm(uint8_t vrt, uint8_t vra, uint8_t vrb,
++                                    uint8_t vrc) {
++  MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
++  spew("vmsumshm\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
++  return writeInst(PPC_vmsumshm | vrt << 21 | vra << 16 | vrb << 11 |
++                   vrc << 6);
++}
++
++BufferOffset Assembler::as_vmsumuhm(uint8_t vrt, uint8_t vra, uint8_t vrb,
++                                    uint8_t vrc) {
++  MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
++  spew("vmsumuhm\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
++  return writeInst(PPC_vmsumuhm | vrt << 21 | vra << 16 | vrb << 11 |
++                   vrc << 6);
++}
++
++BufferOffset Assembler::as_vspltisb(uint8_t vrt, int8_t simm5) {
++  MOZ_ASSERT(vrt < 32);
++  MOZ_ASSERT(simm5 >= -16 && simm5 <= 15);
++  spew("vspltisb\tvr%d,%d", vrt, simm5);
++  return writeInst(PPC_vspltisb | uint32_t(vrt) << 21 |
++                   (uint32_t(simm5) & 0x1F) << 16);
++}
++
++BufferOffset Assembler::as_vspltish(uint8_t vrt, int8_t simm5) {
++  MOZ_ASSERT(vrt < 32);
++  MOZ_ASSERT(simm5 >= -16 && simm5 <= 15);
++  spew("vspltish\tvr%d,%d", vrt, simm5);
++  return writeInst(PPC_vspltish | uint32_t(vrt) << 21 |
++                   (uint32_t(simm5) & 0x1F) << 16);
++}
++
++BufferOffset Assembler::as_vspltisw(uint8_t vrt, int8_t simm5) {
++  MOZ_ASSERT(vrt < 32);
++  MOZ_ASSERT(simm5 >= -16 && simm5 <= 15);
++  spew("vspltisw\tvr%d,%d", vrt, simm5);
++  return writeInst(PPC_vspltisw | uint32_t(vrt) << 21 |
++                   (uint32_t(simm5) & 0x1F) << 16);
++}
++
++// --- Convenience pseudo-instructions ---
++
++BufferOffset Assembler::xs_trap() {
++  spew("trap @ %08x", currentOffset());
++  return writeInst(PPC_trap);
++}
++
++BufferOffset Assembler::xs_trap_tagged(TrapTag tag) {
++  uint32_t tv = PPC_trap | ((uint8_t)tag << 16) | ((uint8_t)tag << 11);
++  spew("trap @ %08x ; MARK %d %08x", currentOffset(), (uint8_t)tag, tv);
++  return writeInst(tv);
++}
++
++BufferOffset Assembler::xs_mr(Register rd, Register ra) {
++  return as_or_(rd, ra, ra);
++}
++
++BufferOffset Assembler::xs_mtctr(Register ra) {
++  return as_mtspr((SPRegisterID)spr_ctr, ra);
++}
++
++BufferOffset Assembler::xs_mtlr(Register ra) {
++  return as_mtspr((SPRegisterID)spr_lr, ra);
++}
++
++BufferOffset Assembler::xs_mflr(Register rd) {
++  return as_mfspr(rd, (SPRegisterID)spr_lr);
++}
++
++BufferOffset Assembler::xs_mtcr(Register rs) { return as_mtcrf(0xff, rs); }
++
++BufferOffset Assembler::xs_mfxer(Register ra) {
++  return as_mfspr(ra, (SPRegisterID)spr_xer);
++}
++
++BufferOffset Assembler::xs_mtxer(Register ra) {
++  return as_mtspr((SPRegisterID)spr_xer, ra);
++}
++
++BufferOffset Assembler::xs_li(Register rd, int16_t im) {
++  return as_addi(rd, r0, im, true);
++}
++
++BufferOffset Assembler::xs_lis(Register rd, int16_t im) {
++  return as_addis(rd, r0, im, true);
++}
++
++BufferOffset Assembler::x_subi(Register rd, Register ra, int16_t im) {
++  return as_addi(rd, ra, -im);
++}
++
++BufferOffset Assembler::x_not(Register rd, Register ra) {
++  return as_nor(rd, ra, ra);
++}
++
++BufferOffset Assembler::x_slwi(Register rd, Register rs, int n) {
++  MOZ_ASSERT(n >= 0 && n < 32);
++  return as_rlwinm(rd, rs, n, 0, 31 - n);
++}
++
++BufferOffset Assembler::x_sldi(Register rd, Register rs, int n) {
++  return as_rldicr(rd, rs, n, 63 - n);
++}
++
++BufferOffset Assembler::x_srwi(Register rd, Register rs, int n) {
++  MOZ_ASSERT(n >= 0 && n < 32);
++  if (n == 0) {
++    return as_rlwinm(rd, rs, 0, 0, 31);
++  }
++  return as_rlwinm(rd, rs, 32 - n, n, 31);
++}
++
++BufferOffset Assembler::x_srdi(Register rd, Register rs, int n) {
++  MOZ_ASSERT(n >= 0 && n < 64);
++  if (n == 0) {
++    return as_or_(rd, rs, rs);
++  }
++  return as_rldicl(rd, rs, 64 - n, n);
++}
++
++BufferOffset Assembler::x_bit_value(Register rd, Register rs, unsigned bit) {
++  return as_rlwinm(rd, rs, bit + 1, 31, 31);
++}
++
++BufferOffset Assembler::x_insertbits0_15(Register rd, Register rs) {
++  return as_rlwimi(rd, rs, 0, 16, 31);
++}
++
++BufferOffset Assembler::x_sr_mulli(Register rd, Register ra, int16_t im) {
++  as_sradi(rd, ra, 63);
++  return as_mulli(rd, rd, im);
++}
++
++void Assembler::as_break(uint32_t code) {
++  spew("break\t%d", code);
++  writeInst(PPC_trap);
++}
++
++// ========================================================================
++// Label binding, retarget, and code label processing.
++// ========================================================================
++
++// Forward-declared shape helpers; full definitions and the layout
++// commentary live with the WriteLoad64Instructions section below.
++static bool IsAddpcisLoad64Stanza(uint32_t enc0);
++static uint8_t Load64StanzaDestReg(Instruction* inst0);
++
++InstImm Assembler::invertBranch(InstImm branch, BOffImm16 skipOffset) {
++  // Flip the BO condition-true/condition-false bit (bit 24).
++  uint32_t data = branch.encode();
++  data = (data ^ 0x01000000) & 0xFFFF0003;
++  data |= skipOffset.encode();
++  branch.setData(data);
++  return branch;
++}
++
++void Assembler::bind(InstImm* inst, uintptr_t branch, uintptr_t target) {
++  intptr_t offset = target - branch;
++  Instruction* i0 = (Instruction*)inst;
++
++  if (i0->next()->encode() == PPC_bcl_always_plus4 ||
++      IsAddpcisLoad64Stanza(i0->encode())) {
++    // Pre-existing long stanza, either P8 (mflr + bcl marker at [1]) or
++    // P9+ (addpcis at [0]; major opcode 19). Either way, just register
++    // the long jump — the stanza's .quad at [6..7] gets patched later
++    // via UpdateLoad64Value.
++    addLongJump(BufferOffset(branch), BufferOffset(target));
++    return;
++  }
++
++  if (i0->isOpcode((uint32_t)PPC_tw)) {
++    // Tagged trap stanza. The tag tells us which branch type was reserved.
++    TrapTag tag = (TrapTag)inst->traptag();
++    Instruction* i1 = i0->next();
++    Instruction* i2 = i1->next();
++    Instruction* i3 = i2->next();
++    Instruction* i4 = i3->next();
++    Instruction* i5 = i4->next();
++    Instruction* i6 = i5->next();
++    Instruction* i7 = i6->next();
++    Instruction* i8 = i7->next();
++    Instruction* i9 = i8->next();
++
++    switch (tag) {
++      case BCTag: {
++        // inst[-1] is the original bc instruction.
++        Instruction* bc = i0 - 1;
++        // Try short bc (offset + 4 because bc is one instruction before tw).
++        if (BOffImm16::IsInRange(offset + (intptr_t)sizeof(uint32_t))) {
++          bc->setData(((bc->encode() ^ 0x01000000) & 0xFFFF0003) |
++                      BOffImm16(offset + sizeof(uint32_t)).encode());
++          i0->makeNop();
++          i1->makeNop();
++          i2->makeNop();
++          i3->makeNop();
++          i4->makeNop();
++          i5->makeNop();
++          i6->makeNop();
++          i7->makeNop();
++          i8->makeNop();
++          i9->makeNop();
++          return;
++        }
++        // Try short b (unconditional).
++        if (JOffImm26::IsInRange(offset)) {
++          i0->setData(PPC_b | JOffImm26(offset).encode());
++          i1->makeNop();
++          i2->makeNop();
++          i3->makeNop();
++          i4->makeNop();
++          i5->makeNop();
++          i6->makeNop();
++          i7->makeNop();
++          i8->makeNop();
++          i9->makeNop();
++          return;
++        }
++        // Long: WriteLoad64 to SecondScratchReg + mtctr + bctr.
++        addLongJump(BufferOffset(branch), BufferOffset(target));
++        WriteLoad64Instructions(i0, SecondScratchReg,
++                                LabelBase::INVALID_OFFSET);
++        i8->makeOp_mtctr(SecondScratchReg);
++        i9->makeOp_bctr();
++        break;
++      }
++      case CallTag: {
++        // For calls, the actual call instruction goes at inst[9] and
++        // the return address must be after the stanza.
++        intptr_t callOffset = offset - 9 * (intptr_t)sizeof(uint32_t);
++        if (JOffImm26::IsInRange(callOffset)) {
++          i0->makeNop();
++          i1->makeNop();
++          i2->makeNop();
++          i3->makeNop();
++          i4->makeNop();
++          i5->makeNop();
++          i6->makeNop();
++          i7->makeNop();
++          i8->makeNop();
++          i9->setData(PPC_b | JOffImm26(callOffset).encode() | LinkB);
++          return;
++        }
++        // Long: WriteLoad64 to SecondScratchReg + mtctr + bctrl.
++        addLongJump(BufferOffset(branch), BufferOffset(target));
++        WriteLoad64Instructions(i0, SecondScratchReg,
++                                LabelBase::INVALID_OFFSET);
++        i8->makeOp_mtctr(SecondScratchReg);
++        i9->makeOp_bctr(LinkB);
++        break;
++      }
++      case BTag: {
++        if (JOffImm26::IsInRange(offset)) {
++          i0->setData(PPC_b | JOffImm26(offset).encode());
++          i1->makeNop();
++          i2->makeNop();
++          i3->makeNop();
++          i4->makeNop();
++          i5->makeNop();
++          i6->makeNop();
++          i7->makeNop();
++          i8->makeNop();
++          i9->makeNop();
++          return;
++        }
++        // Long: WriteLoad64 to SecondScratchReg + mtctr + bctr.
++        addLongJump(BufferOffset(branch), BufferOffset(target));
++        WriteLoad64Instructions(i0, SecondScratchReg,
++                                LabelBase::INVALID_OFFSET);
++        i8->makeOp_mtctr(SecondScratchReg);
++        i9->makeOp_bctr();
++        break;
++      }
++      default:
++        MOZ_CRASH("Unexpected TrapTag");
++    }
++    return;
++  }
++
++  if (i0->isOpcode(PPC_b)) {
++    // Short unconditional branch — set offset, nop next-in-chain slot.
++    MOZ_ASSERT(JOffImm26::IsInRange(offset));
++    i0->setData((i0->encode() & ~0x03FFFFFC) | JOffImm26(offset).encode());
++    i0->next()->makeNop();
++    return;
++  }
++
++  if (i0->isOpcode(PPC_bc)) {
++    // Short conditional branch — preserve upper 16 bits, set offset.
++    MOZ_ASSERT(BOffImm16::IsInRange(offset));
++    i0->setData((i0->encode() & 0xFFFF0003) | BOffImm16(offset).encode());
++    i0->next()->makeNop();
++    return;
++  }
++
++  MOZ_CRASH("Unexpected instruction in bind");
++}
++
++void Assembler::bind(Label* label, BufferOffset boff) {
++  if (label->used()) {
++    bool more;
++    BufferOffset b(label);
++    do {
++      BufferOffset next;
++      InstImm* inst = (InstImm*)editSrc(b);
++      Instruction* i1 = ((Instruction*)inst)->next();
++      more = (i1->encode() != LabelBase::INVALID_OFFSET);
++      if (more) {
++        next = BufferOffset(i1->encode());
++      }
++      bind(inst, b.getOffset(), boff.getOffset());
++      b = next;
++    } while (more);
++  }
++  label->bind(boff.getOffset());
++}
++
++void Assembler::retarget(Label* label, Label* target) {
++  spew("retarget");
++  if (label->used() && !oom()) {
++    if (target->bound()) {
++      bind(label, BufferOffset(target));
++    } else if (target->used()) {
++      // Prepend label's use chain to target's use chain.
++      BufferOffset b(label);
++      BufferOffset next;
++      do {
++        Instruction* inst = (Instruction*)editSrc(b);
++        Instruction* i1 = inst->next();
++        if (i1->encode() != LabelBase::INVALID_OFFSET) {
++          next = BufferOffset(i1->encode());
++        } else {
++          // End of label's chain — link to target's head.
++          i1->setData(target->offset());
++          break;
++        }
++        b = next;
++      } while (true);
++    }
++    // Transfer label's use list to target.
++    if (!target->bound()) {
++      target->use(label->offset());
++    }
++  }
++  label->reset();
++}
++
++void Assembler::processCodeLabels(uint8_t* rawCode) {
++  for (const CodeLabel& label : codeLabels_) {
++    Bind(rawCode, label);
++  }
++}
++
++// ========================================================================
++// Load64 instruction sequence (8 slots, literal pool format):
++//   [0] mflr r0            — save LR
++//   [1] bcl 20,0,.+4      — LR = address of [2]
++//   [2] mflr rD            — rD = address of [2]
++//   [3] mtlr r0            — restore LR
++//   [4] ld rD, 16(rD)      — load from [6..7] (offset = 24 - 8 = 16)
++//   [5] b .+12             — skip data
++//   [6..7] .quad VALUE     — 8-byte data
++// ========================================================================
++
++// ========================================================================
++// Constant pool callbacks (required by AssemblerBufferWithConstantPools).
++// ========================================================================
++
++/* static */
++void Assembler::InsertIndexIntoTag(uint8_t* load, uint32_t index) {
++  // Stash the pool entry index in the hint word's low 16 bits; the high
++  // bits carry the dest reg and load type, consumed by
++  // PatchConstantPoolLoad when the pool is resolved.
++  uint32_t* inst = (uint32_t*)load;
++  *inst = (*inst & 0xFFFF0000) | (index & 0xFFFF);
++}
++
++/* static */
++bool Assembler::PatchConstantPoolLoad(void* loadAddr, void* constPoolAddr) {
++  // Rewrite placeholder instructions with a pool load sequence.
++  // Hint word layout (set by loadFromPoolFloat64 / loadFromPoolFloat32 /
++  // loadFromPoolSimd128):
++  //   bits 0-15:  pool entry index
++  //   bits 16-20: destination register (FPR encoding)
++  //   bits 21-22: load type (PoolLoadFPR64, PoolLoadSimd128, PoolLoadFPR32)
++  //   bits 28-31: sentinel 0xF
++
++  uint32_t* inst = (uint32_t*)loadAddr;
++
++  uint32_t hint = inst[0];
++  uint32_t index = hint & 0xFFFF;
++  uint32_t destReg = (hint >> 16) & 0x1F;
++  uint32_t loadType = (hint >> 21) & 0x3;
++
++  // Displacement: pool entry address relative to inst[1] (mflr target) for the
++  // bcl path, or relative to inst[0]+4 (addpcis target = CIA+4, which is the
++  // address of inst[1]) for the addpcis path. Both conventions resolve to the
++  // same value: (pool entry) − (loadAddr + 4).
++  int32_t displacement =
++      (int32_t)((uint8_t*)constPoolAddr + index * 4 - ((uint8_t*)loadAddr + 4));
++
++  if (loadType == PoolLoadFPR64 || loadType == PoolLoadFPR32) {
++    // Three emission paths:
++    //
++    // POWER10 (preferred): plfd/plfs FRT, SI(0), R=1 — single PC-relative
++    //   prefixed FP load. 8 bytes = 2 slots; slot 2 becomes a nop. If
++    //   loadAddr % 64 == 60, plfd would straddle a 64-byte block, so emit
++    //   a leading nop at slot 0 and place plfd at slots 1-2 instead.
++    //   Reach: ±8 GB (34-bit signed). No LR clobber, no r16 base.
++    //
++    // POWER9: addpcis + lfd/lfs + nop. 2 real insns, no LR clobber, no
++    //   Return Address Stack corruption. Base register is r16.
++    //   Displacement splits into (hi << 16) + lo where lo is the 16-bit
++    //   signed D-field of lfd/lfs. Reach: ±2 GB.
++    //
++    // POWER8: bcl + mflr r16 + lfd/lfs. Same clobber + RAS caveat as before.
++    //   Kept as a correctness fallback; not exercised today because the
++    //   loadConstantDouble/Float32 wrappers skip the pool on POWER8.
++    //
++    // lfs/plfs (32-bit) auto-expand their result to double-precision in the
++    // FPR, replacing the non-pool path's separate xscvspdpn step.
++    uint32_t baseReg = SavedScratchRegister.code();
++    uint32_t loadOp = (loadType == PoolLoadFPR64) ? PPC_lfd : PPC_lfs;
++
++    if (HasPOWER10()) {
++      // MLS prefixed FP load. plfd suffix opcode = 50, plfs = 48. Same
++      // alignment-driven slot placement as PoolLoadSimd128 above.
++      uint64_t loadAddrBits = reinterpret_cast<uint64_t>(loadAddr);
++      // loadAddr is the buffer-time pointer; the final executable base is
++      // only 16-byte aligned, so the unsafe straddle is when
++      // (loadAddrBits & 15) == 12 (matches ensurePrefixedAlignment above).
++      bool needLeadingNop = (loadAddrBits & 15) == 12;
++      int prefixSlot = needLeadingNop ? 1 : 0;
++      int prefixByteOffset = prefixSlot * 4;
++      int64_t SI = int64_t(displacement) + 4 - prefixByteOffset;
++      MOZ_ASSERT(SI >= -(int64_t(1) << 33) && SI < (int64_t(1) << 33));
++      uint32_t d0 = uint32_t((uint64_t(SI) >> 16) & 0x3FFFFu);
++      uint32_t d1 = uint32_t(uint64_t(SI) & 0xFFFFu);
++      // Type 2 (MLS), R=1, RA=0.
++      uint32_t prefix =
++          (1u << 26) | (2u << 24) | (1u << 20) | (d0 & 0x3FFFFu);
++      uint32_t suffixOp = (loadType == PoolLoadFPR64) ? 50u : 48u;
++      uint32_t suffix = (suffixOp << 26) | (destReg << 21) | d1;
++
++      if (needLeadingNop) {
++        inst[0] = NopInst;
++        inst[1] = prefix;
++        inst[2] = suffix;
++      } else {
++        inst[0] = prefix;
++        inst[1] = suffix;
++        inst[2] = NopInst;
++      }
++    } else if (HasPOWER9()) {
++      // Split displacement into addpcis hi field and lfd/lfs lo field so that
++      //   target = (CIA + 4) + (hi << 16) + SEXT16(lo).
++      // Only 2 slots are reserved on P9 (loadFromPoolFloat{32,64} above);
++      // do NOT touch inst[2], it belongs to the next entry.
++      int16_t lo = (int16_t)(displacement & 0xFFFF);
++      int32_t hiAdj = displacement - lo;
++      MOZ_ASSERT((hiAdj & 0xFFFF) == 0);
++      int32_t hi = hiAdj >> 16;
++      MOZ_ASSERT(hi >= -32768 && hi <= 32767);
++      // [0] addpcis r16, hi
++      uint32_t Dhi = uint16_t(hi);
++      inst[0] = (19u << 26) | (baseReg << 21) | ((Dhi >> 1) & 0x1F) << 16 |
++                ((Dhi >> 6) & 0x3FF) << 6 | (2u << 1) | (Dhi & 1u);
++      // [1] lfd/lfs fD, lo(r16)
++      inst[1] = loadOp | (destReg << 21) | (baseReg << 16) | (uint16_t(lo));
++    } else {
++      MOZ_ASSERT(displacement >= -32768 && displacement < 32768);
++      // [0] bcl 20,0,$+4
++      inst[0] = PPC_bcl_always_plus4;
++      // [1] mflr r16
++      inst[1] = PPC_mfspr | (baseReg << 21) | PPC_SPR(spr_lr);
++      // [2] lfd/lfs fD, displacement(r16)
++      inst[2] =
++          loadOp | (destReg << 21) | (baseReg << 16) | (displacement & 0xFFFF);
++    }
++  } else if (loadType == PoolLoadSimd128) {
++    // Three emission paths (5 slots reserved by loadFromPoolSimd128):
++    //
++    // POWER10 (preferred): plxv vsD, SI(0), R=1 — single PC-relative
++    //   prefixed load, natural-LE byte order (no xxpermdi needed). 8 bytes
++    //   = 2 slots; slots 2-4 become nops. If the prefix would straddle a
++    //   64-byte block (loadAddr % 64 == 60), emit a leading nop at slot 0
++    //   and place plxv at slots 1-2 instead. Reach: ±8 GB (34-bit signed).
++    //
++    // POWER9: addpcis-equivalent via bcl + mflr + addi + lxvx + nop. 5
++    //   real insns, natural LE.
++    //
++    // POWER8: same prelude + lxvd2x + xxpermdi (BE-DW byte-swap fixup).
++    //
++    // See PoolLoadFPR64 above for why r16 instead of r12.
++    MOZ_ASSERT(displacement >= -32768 && displacement < 32768);
++    // Simd128 dest is in VR-namespace (encoding 32-63). Hint stores only
++    // the low 5 bits (loadFromPoolSimd128 masks); we set TX unconditionally
++    // since PoolLoadSimd128 always targets a Simd128.
++    constexpr uint32_t kTX = 1u;
++    constexpr uint32_t kAxBxTx_xxpermdi = (1u << 2) | (1u << 1) | 1u;
++
++    if (HasPOWER10()) {
++      // Place plxv prefix at the highest 4-byte-aligned offset within
++      // the 5 reserved slots that doesn't straddle a 64-byte block.
++      uint64_t loadAddrBits = reinterpret_cast<uint64_t>(loadAddr);
++      // loadAddr is the buffer-time pointer; the final executable base is
++      // only 16-byte aligned, so the unsafe straddle is when
++      // (loadAddrBits & 15) == 12 (matches ensurePrefixedAlignment above).
++      bool needLeadingNop = (loadAddrBits & 15) == 12;
++      int prefixSlot = needLeadingNop ? 1 : 0;
++      int prefixByteOffset = prefixSlot * 4;
++      // SI = (pool entry addr) - (prefix addr)
++      //    = (loadAddr + 4 + displacement) - (loadAddr + prefixByteOffset)
++      //    = displacement + 4 - prefixByteOffset
++      int64_t SI = int64_t(displacement) + 4 - prefixByteOffset;
++      MOZ_ASSERT(SI >= -(int64_t(1) << 33) && SI < (int64_t(1) << 33));
++      uint32_t d0 = uint32_t((uint64_t(SI) >> 16) & 0x3FFFFu);
++      uint32_t d1 = uint32_t(uint64_t(SI) & 0xFFFFu);
++      // Prefix: primary opcode 1, Type 0 (8LS), R=1, d0 at LE bits 17..0.
++      uint32_t prefix =
++          (1u << 26) | (0u << 24) | (1u << 20) | (d0 & 0x3FFFFu);
++      // Suffix: 5-bit opcode 25 at LE 31..27, TX at LE 26, T at LE 25..21,
++      //         RA=0 at LE 20..16, d1 at LE 15..0.
++      uint32_t suffix = (25u << 27) | (kTX << 26) | (destReg << 21) | d1;
++
++      // P10 reserves 3 slots; only inst[0..2] are written. Slots 3..4
++      // belong to the next pool entry on P10.
++      if (needLeadingNop) {
++        inst[0] = NopInst;
++        inst[1] = prefix;
++        inst[2] = suffix;
++      } else {
++        inst[0] = prefix;
++        inst[1] = suffix;
++        inst[2] = NopInst;
++      }
++    } else if (HasPOWER9()) {
++      // addpcis + addi + lxvx (3 slots) — no LR clobber, no RAS hazard.
++      // Same displacement split as the FP scalar P9 path: target =
++      // (CIA+4) + (hi << 16) + SEXT16(lo). lxvx is X-form indexed (no
++      // immediate offset), so combine the low 16 bits into r16 via addi
++      // before the load.
++      int16_t lo = (int16_t)(displacement & 0xFFFF);
++      int32_t hiAdj = displacement - lo;
++      MOZ_ASSERT((hiAdj & 0xFFFF) == 0);
++      int32_t hi = hiAdj >> 16;
++      MOZ_ASSERT(hi >= -32768 && hi <= 32767);
++      uint32_t Dhi = uint16_t(hi);
++      uint32_t baseReg = SavedScratchRegister.code();
++      // [0] addpcis r16, hi
++      inst[0] = (19u << 26) | (baseReg << 21) | ((Dhi >> 1) & 0x1F) << 16 |
++                ((Dhi >> 6) & 0x3FF) << 6 | (2u << 1) | (Dhi & 1u);
++      // [1] addi r16, r16, lo
++      inst[1] = PPC_addi | (baseReg << 21) | (baseReg << 16) | uint16_t(lo);
++      // [2] lxvx vsD, 0, r16  (XT[0:4] in bits 21-25, TX at bit 0)
++      inst[2] = PPC_lxvx | (destReg << 21) | (baseReg << 11) | kTX;
++    } else {
++      // P8 fallback: bcl + mflr + addi + lxvd2x + xxpermdi (5 slots).
++      // Clobbers LR; correctness-only path.
++      uint32_t baseReg = SavedScratchRegister.code();
++      inst[0] = PPC_bcl_always_plus4;
++      inst[1] = PPC_mfspr | (baseReg << 21) | PPC_SPR(spr_lr);
++      inst[2] = PPC_addi | (baseReg << 21) | (baseReg << 16) |
++                (displacement & 0xFFFF);
++      // lxvd2x XT, RA=0, RB=r16 — loads in BE order on LE.
++      inst[3] = PPC_lxvd2x | (destReg << 21) | (baseReg << 11) | kTX;
++      // xxpermdi XT, XT, XT, 2 — swap doublewords for LE byte order.
++      inst[4] = PPC_xxpermdi | (destReg << 21) | (destReg << 16) |
++                (destReg << 11) | (2u << 8) | kAxBxTx_xxpermdi;
++    }
++  } else {
++    MOZ_CRASH("PatchConstantPoolLoad: unsupported load type");
++  }
++
++  return false;
++}
++
++/* static */
++void Assembler::WritePoolGuard(BufferOffset branch, Instruction* inst,
++                               BufferOffset dest) {
++  // Emit an unconditional branch over the pool data.
++  int32_t offset = dest.getOffset() - branch.getOffset();
++  MOZ_ASSERT(JOffImm26::IsInRange(offset));
++  inst->setData(PPC_b | (offset & 0x03FFFFFC));
++}
++
++/* static */
++void Assembler::WritePoolHeader(uint8_t* start, Pool* p, bool isNatural) {
++  // Write pool identification header.
++  // Encode pool size and isNatural flag in a single 32-bit word.
++  uint32_t poolSize = p->getPoolSize();
++  uint32_t sizeInWords = (poolSize + 4 + 3) >> 2;  // header + data, in words
++  MOZ_ASSERT(sizeInWords < (1 << 15));
++  uint32_t header = (sizeInWords & 0x7FFF) | (isNatural ? (1 << 15) : 0) |
++                    0xFFFF0000;  // sentinel
++  *(uint32_t*)start = header;
++}
++
++/* static */
++void Assembler::PatchShortRangeBranchToVeneer(PPCBuffer*, unsigned rangeIdx,
++                                              BufferOffset deadline,
++                                              BufferOffset veneer) {
++  // PPC64 does not use short-range branch tracking (NumShortBranchRanges = 0).
++  MOZ_CRASH("PatchShortRangeBranchToVeneer: should not be called");
++}
++
++// Two stanza shapes share the same 8-slot footprint and the same .quad
++// location at slots [6..7] (so ExtractLoad64Value / UpdateLoad64Value are
++// shape-agnostic):
++//
++//   POWER8 (no addpcis):
++//     [0] mflr r0
++//     [1] bcl 20,0,.+4         (LR := pc of [2])
++//     [2] mflr rD
++//     [3] mtlr r0
++//     [4] ld rD, 16(rD)
++//     [5] b .+12
++//     [6..7] .quad VALUE
++//
++//   POWER9+ (addpcis):
++//     [0] addpcis rD, 0        (rD := NIA = pc of [1])
++//     [1] ld rD, 20(rD)        (rD := mem[pc_of_[1] + 20] = mem[slot[6]])
++//     [2] b .+24
++//     [3..5] NOP, NOP, NOP
++//     [6..7] .quad VALUE
++//
++// The P9+ form drops the bcl/mflr/mtlr LR-bounce (no RAS thrash) and runs
++// 2 dynamic insns instead of 6. Distinguished at patch time by inst[0]'s
++// major opcode: 31 = mfspr (P8) vs 19 = addpcis (P9+).
++static bool IsAddpcisLoad64Stanza(uint32_t enc0) {
++  return ((enc0 >> 26) & 0x3f) == 19;
++}
++
++// Extract the destination register from a load64 stanza in either shape.
++// P8 stores rD in `mflr rD` at slot [2]; P9+ stores rD in `addpcis rD, 0`
++// at slot [0]. Both encode RT at LE bits [21..25].
++static uint8_t Load64StanzaDestReg(Instruction* inst0) {
++  if (IsAddpcisLoad64Stanza(inst0->encode())) {
++    return (inst0[0].encode() >> 21) & 0x1f;
++  }
++  return (inst0[2].encode() >> 21) & 0x1f;
++}
++
++/* static */
++void Assembler::WriteLoad64Instructions(Instruction* inst0, Register reg,
++                                        uint64_t value) {
++  Instruction* i1 = inst0->next();
++  Instruction* i2 = i1->next();
++  Instruction* i3 = i2->next();
++  Instruction* i4 = i3->next();
++  Instruction* i5 = i4->next();
++  Instruction* i6 = i5->next();
++  Instruction* i7 = i6->next();
++
++  if (HasPOWER9()) {
++    // [0] addpcis rD, 0   (DX-form: opcode=19, XO=2, all D fields = 0)
++    inst0->setData(0x4C000004u | (uint32_t(reg.code()) << 21));
++    // [1] ld rD, 20(rD)   (rD := *(slot[1] + 20) = *(slot[6]) = .quad)
++    i1->setData(PPC_ld | (uint32_t(reg.code()) << 21) |
++                (uint32_t(reg.code()) << 16) | 20);
++    // [2] b .+24          (skip slots [3..7] to land at slot [8])
++    i2->setData(PPC_b | (24 & 0x03FFFFFC));
++    // [3..5] NOP filler — unreachable but kept aligned for the patcher.
++    i3->setData(NopInst);
++    i4->setData(NopInst);
++    i5->setData(NopInst);
++  } else {
++    // [0] mflr r0
++    inst0->setData(PPC_mfspr | (r0.code() << 21) | PPC_SPR(spr_lr));
++    // [1] bcl 20,0,.+4
++    i1->setData(PPC_bcl_always_plus4);
++    // [2] mflr rD
++    i2->setData(PPC_mfspr | (reg.code() << 21) | PPC_SPR(spr_lr));
++    // [3] mtlr r0
++    i3->setData(PPC_mtspr | (r0.code() << 21) | PPC_SPR(spr_lr));
++    // [4] ld rD, 16(rD)
++    i4->setData(PPC_ld | (reg.code() << 21) | (reg.code() << 16) | 16);
++    // [5] b .+12
++    i5->setData(PPC_b | (12 & 0x03FFFFFC));
++  }
++
++  // [6..7] .quad VALUE (low 32 at lower addr, high 32 at higher addr).
++  i6->setData((uint32_t)(value & 0xFFFFFFFF));
++  i7->setData((uint32_t)(value >> 32));
++}
++
++/* static */
++uint64_t Assembler::ExtractLoad64Value(Instruction* inst0) {
++  // The 8-byte value is at inst0[6..7] in both shapes.
++  Instruction* i6 = inst0 + 6;
++  Instruction* i7 = inst0 + 7;
++
++  uint64_t lo = (uint64_t)i6->encode();  // low 32 at lower addr
++  uint64_t hi = (uint64_t)i7->encode();  // high 32 at higher addr
++  return (hi << 32) | lo;
++}
++
++/* static */
++void Assembler::UpdateLoad64Value(Instruction* inst0, uint64_t value) {
++  // Sanity-check that inst0 is the start of a load64 stanza in either shape.
++  // P8: inst0[1] == bcl 20,0,.+4. P9+: inst0[0] is addpcis (major opcode 19).
++  MOZ_ASSERT(inst0[1].encode() == PPC_bcl_always_plus4 ||
++                 IsAddpcisLoad64Stanza(inst0->encode()),
++             "UpdateLoad64Value: inst0 is not a load64 stanza");
++
++  // .quad lives at inst0[6..7] in both shapes.
++  Instruction* i6 = inst0 + 6;
++  Instruction* i7 = inst0 + 7;
++
++  i6->setData((uint32_t)(value & 0xFFFFFFFF));  // low 32 at lower addr
++  i7->setData((uint32_t)(value >> 32));         // high 32 at higher addr
++}
++
++// ========================================================================
++// Patching and toggle operations.
++// ========================================================================
++
++/* static */
++uint32_t Assembler::PatchWrite_NearCallSize() {
++  // 8 instructions for Load64 + mtctr + bctrl = 10 instructions.
++  return 10 * sizeof(uint32_t);
++}
++
++/* static */
++void Assembler::PatchWrite_NearCall(CodeLocationLabel start,
++                                    CodeLocationLabel toCall) {
++  Instruction* inst = (Instruction*)start.raw();
++  uint8_t* dest = toCall.raw();
++
++  Assembler::WriteLoad64Instructions(inst, SavedScratchRegister,
++                                     (uint64_t)dest);
++  inst[8].makeOp_mtctr(SavedScratchRegister);
++  inst[9].makeOp_bctr(LinkB);
++  FlushICache(inst, 10 * sizeof(Instruction));
++}
++
++/* static */
++void Assembler::PatchWrite_Imm32(CodeLocationLabel label, Imm32 imm) {
++  uint32_t* l = (uint32_t*)label.raw();
++  *(l - 1) = imm.value;
++  FlushICache(l - 1, sizeof(uint32_t));
++}
++
++void Assembler::PatchDataWithValueCheck(CodeLocationLabel label,
++                                        ImmPtr newValue, ImmPtr expectedValue) {
++  PatchDataWithValueCheck(label, PatchedImmPtr(newValue.value),
++                          PatchedImmPtr(expectedValue.value));
++}
++
++void Assembler::PatchDataWithValueCheck(CodeLocationLabel label,
++                                        PatchedImmPtr newValue,
++                                        PatchedImmPtr expectedValue) {
++  Instruction* inst = (Instruction*)label.raw();
++
++  DebugOnly<uint64_t> value = Assembler::ExtractLoad64Value(inst);
++  MOZ_ASSERT(value == uint64_t(expectedValue.value));
++
++  Assembler::UpdateLoad64Value(inst, uint64_t(newValue.value));
++  FlushICache(inst, 8 * sizeof(Instruction));
++}
++
++// ToggleCall toggles the call portion of a toggledCall stanza.
++// Layout: 8 load64 instructions + mtctr + bctrl (10 total).
++// We toggle the last two instructions (mtctr/bctrl vs nop/nop).
++// The destination register is extracted via Load64StanzaDestReg, which
++// handles both the P8 (mflr-rD at slot [2]) and P9+ (addpcis-rD at slot
++// [0]) shapes.
++
++/* static */
++void Assembler::ToggleCall(CodeLocationLabel inst_, bool enabled) {
++  Instruction* i0 = (Instruction*)inst_.raw();
++  Instruction* i8 = (Instruction*)(inst_.raw() + 8 * sizeof(uint32_t));
++  Instruction* i9 = (Instruction*)(inst_.raw() + 9 * sizeof(uint32_t));
++
++  // Accept either P8 stanza (mflr r0 at slot [0]) or P9+ stanza (addpcis at
++  // slot [0]; major opcode 19).
++  MOZ_ASSERT(i0->encode() == (PPC_mfspr | (r0.code() << 21) | PPC_SPR(spr_lr)) ||
++                 IsAddpcisLoad64Stanza(i0->encode()));
++
++  // ToggleCall is idempotent across the same `enabled` value: re-enabling
++  // an already-enabled site (or re-disabling a disabled one) is a no-op.
++  // Mozilla's debugger machinery may legitimately toggle the same call site
++  // multiple times in the same direction (e.g. setting both a breakpoint
++  // and a frame.onStep on the same script).
++  Register scratch = Register::FromCode(Load64StanzaDestReg(i0));
++  uint32_t mtctr = PPC_mtspr | (scratch.code() << 21) | PPC_SPR(spr_ctr);
++  uint32_t bctrl = (uint32_t)PPC_bctr | (uint32_t)LinkB;
++  if (enabled) {
++    MOZ_ASSERT(i8->encode() == NopInst || i8->encode() == mtctr);
++    MOZ_ASSERT(i9->encode() == NopInst || i9->encode() == bctrl);
++    i8->setData(mtctr);
++    i9->setData(bctrl);
++  } else {
++    MOZ_ASSERT(i8->encode() == NopInst || i8->encode() == mtctr);
++    MOZ_ASSERT(i9->encode() == NopInst || i9->encode() == bctrl);
++    i8->setData(NopInst);
++    i9->setData(NopInst);
++  }
++  FlushICache(i8, 2 * sizeof(Instruction));
++}
++
++// toggledJump emits a trap stanza via jump(label). After binding, the first
++// instruction becomes "b offset" (short branch). We toggle between b and ori:
++//   b offset:       [010010][LI:24][0][0]
++//   ori r0,r0,imm:  [011000][00000][00000][UI:16]
++// For short forward jumps (offset < 64KB), bits 25:16 of LI are 0, so
++// swapping the opcode preserves the offset in the lower 16 bits.
++// ori r0,r0,X is effectively a nop (writes to r0).
++
++/* static */
++void Assembler::ToggleToJmp(CodeLocationLabel inst_) {
++  Instruction* inst = (Instruction*)inst_.raw();
++  MOZ_ASSERT(inst->isOpcode(PPC_ori));
++  // Verify RS=0 and RA=0 (r0).
++  MOZ_ASSERT((inst->encode() & 0x03E00000) == 0);
++  MOZ_ASSERT((inst->encode() & 0x001F0000) == 0);
++  // Swap opcode from ori (011000) to b (010010).
++  uint32_t encoding = inst->encode();
++  encoding = (encoding & 0x03FFFFFF) | (uint32_t)PPC_b;
++  inst->setData(encoding);
++  FlushICache(inst, sizeof(Instruction));
++}
++
++/* static */
++void Assembler::ToggleToCmp(CodeLocationLabel inst_) {
++  Instruction* inst = (Instruction*)inst_.raw();
++  MOZ_ASSERT(inst->isOpcode(PPC_b));
++  // Verify short forward branch: upper LI bits (25:16) are 0, AA=0, LK=0.
++  MOZ_ASSERT((inst->encode() & 0x03FF0003) == 0);
++  // Swap opcode from b (010010) to ori (011000).
++  uint32_t encoding = inst->encode();
++  encoding = (encoding & 0x03FFFFFF) | (uint32_t)PPC_ori;
++  inst->setData(encoding);
++  FlushICache(inst, sizeof(Instruction));
++}
++
++// ========================================================================
++// Bind, tracing, and pointer extraction.
++// ========================================================================
++
++void Assembler::Bind(uint8_t* rawCode, const CodeLabel& label) {
++  if (label.patchAt().bound()) {
++    auto mode = label.linkMode();
++    intptr_t offset = label.patchAt().offset();
++    intptr_t target = label.target().offset();
++
++    if (mode == CodeLabel::RawPointer) {
++      *reinterpret_cast<const void**>(rawCode + offset) = rawCode + target;
++    } else {
++      MOZ_ASSERT(mode == CodeLabel::MoveImmediate ||
++                 mode == CodeLabel::JumpImmediate);
++      Instruction* inst = (Instruction*)(rawCode + offset);
++      Assembler::UpdateLoad64Value(inst, (uint64_t)(rawCode + target));
++    }
++  }
++}
++
++uintptr_t Assembler::GetPointer(uint8_t* instPtr) {
++  Instruction* inst = (Instruction*)instPtr;
++  return Assembler::ExtractLoad64Value(inst);
++}
++
++static JitCode* CodeFromJump(Instruction* jump) {
++  uint8_t* target = (uint8_t*)Assembler::ExtractLoad64Value(jump);
++  return JitCode::FromExecutable(target);
++}
++
++void Assembler::TraceJumpRelocations(JSTracer* trc, JitCode* code,
++                                     CompactBufferReader& reader) {
++  while (reader.more()) {
++    JitCode* child =
++        CodeFromJump((Instruction*)(code->raw() + reader.readUnsigned()));
++    TraceManuallyBarrieredEdge(trc, &child, "rel32");
++  }
++}
++
++static void TraceOneDataRelocation(JSTracer* trc,
++                                   mozilla::Maybe<AutoWritableJitCode>& awjc,
++                                   JitCode* code, Instruction* inst) {
++  void* ptr = (void*)Assembler::ExtractLoad64Value(inst);
++  void* prior = ptr;
++
++  uintptr_t word = reinterpret_cast<uintptr_t>(ptr);
++  if (word >> JSVAL_TAG_SHIFT) {
++    Value v = Value::fromRawBits(word);
++    TraceManuallyBarrieredEdge(trc, &v, "jit-masm-value");
++    ptr = (void*)v.bitsAsPunboxPointer();
++  } else {
++    TraceManuallyBarrieredGenericPointerEdge(
++        trc, reinterpret_cast<gc::Cell**>(&ptr), "jit-masm-ptr");
++  }
++
++  if (ptr != prior) {
++    if (awjc.isNothing()) {
++      awjc.emplace(code);
++    }
++    Assembler::UpdateLoad64Value(inst, uint64_t(ptr));
++  }
++}
++
++/* static */
++void Assembler::TraceDataRelocations(JSTracer* trc, JitCode* code,
++                                     CompactBufferReader& reader) {
++  mozilla::Maybe<AutoWritableJitCode> awjc;
++  while (reader.more()) {
++    size_t offset = reader.readUnsigned();
++    Instruction* inst = (Instruction*)(code->raw() + offset);
++    TraceOneDataRelocation(trc, awjc, code, inst);
++  }
++}
++
++/* static */
++uint8_t* Assembler::NextInstruction(uint8_t* instruction, uint32_t* count) {
++  if (count != nullptr) {
++    *count += sizeof(Instruction);
++  }
++  return instruction + sizeof(Instruction);
++}
++
++// ========================================================================
++// UseScratchRegisterScope implementation.
++// ========================================================================
++
++UseScratchRegisterScope::UseScratchRegisterScope(Assembler& assembler)
++    : available_(assembler.GetScratchRegisterList()),
++      old_available_(*available_) {}
++
++UseScratchRegisterScope::UseScratchRegisterScope(Assembler* assembler)
++    : available_(assembler->GetScratchRegisterList()),
++      old_available_(*available_) {}
++
++UseScratchRegisterScope::~UseScratchRegisterScope() {
++  *available_ = old_available_;
++}
++
++Register UseScratchRegisterScope::Acquire() {
++  MOZ_ASSERT(available_ != nullptr);
++  MOZ_ASSERT(!available_->empty());
++  Register index = GeneralRegisterSet::FirstRegister(available_->bits());
++  available_->takeRegisterIndex(index);
++  return index;
++}
++
++void UseScratchRegisterScope::Release(const Register& reg) {
++  MOZ_ASSERT(available_ != nullptr);
++  MOZ_ASSERT(old_available_.hasRegisterIndex(reg));
++  MOZ_ASSERT(!available_->hasRegisterIndex(reg));
++  Include(GeneralRegisterSet(1 << reg.code()));
++}
++
++bool UseScratchRegisterScope::hasAvailable() const {
++  return (available_->size()) != 0;
++}
+diff --git a/js/src/jit/ppc64/Assembler-ppc64.h b/js/src/jit/ppc64/Assembler-ppc64.h
+new file mode 100644
+index 000000000000..60e84bf71cf7
+--- /dev/null
++++ b/js/src/jit/ppc64/Assembler-ppc64.h
+@@ -0,0 +1,2114 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_Assembler_ppc64_h
++#define jit_ppc64_Assembler_ppc64_h
++
++#include "jit/CompactBuffer.h"
++#include "jit/JitCode.h"
++#include "jit/JitSpewer.h"
++#include "jit/ppc64/Architecture-ppc64.h"
++#include "jit/shared/Assembler-shared.h"
++#include "jit/shared/Disassembler-shared.h"
++#include "jit/shared/IonAssemblerBuffer.h"
++#include "jit/shared/IonAssemblerBufferWithConstantPools.h"
++#include "wasm/WasmTypeDecls.h"
++
++namespace js {
++namespace jit {
++
++// GPR register constants.
++static constexpr Register r0{Registers::r0};
++static constexpr Register r1{Registers::r1};
++static constexpr Register r2{Registers::r2};
++static constexpr Register r3{Registers::r3};
++static constexpr Register r4{Registers::r4};
++static constexpr Register r5{Registers::r5};
++static constexpr Register r6{Registers::r6};
++static constexpr Register r7{Registers::r7};
++static constexpr Register r8{Registers::r8};
++static constexpr Register r9{Registers::r9};
++static constexpr Register r10{Registers::r10};
++static constexpr Register r11{Registers::r11};
++static constexpr Register r12{Registers::r12};
++static constexpr Register r13{Registers::r13};
++static constexpr Register r14{Registers::r14};
++static constexpr Register r15{Registers::r15};
++static constexpr Register r16{Registers::r16};
++static constexpr Register r17{Registers::r17};
++static constexpr Register r18{Registers::r18};
++static constexpr Register r19{Registers::r19};
++static constexpr Register r20{Registers::r20};
++static constexpr Register r21{Registers::r21};
++static constexpr Register r22{Registers::r22};
++static constexpr Register r23{Registers::r23};
++static constexpr Register r24{Registers::r24};
++static constexpr Register r25{Registers::r25};
++static constexpr Register r26{Registers::r26};
++static constexpr Register r27{Registers::r27};
++static constexpr Register r28{Registers::r28};
++static constexpr Register r29{Registers::r29};
++static constexpr Register r30{Registers::r30};
++static constexpr Register r31{Registers::r31};
++
++// FPR register constants.
++static constexpr FloatRegister f0{FloatRegisters::f0, FloatRegisters::Double};
++static constexpr FloatRegister f1{FloatRegisters::f1, FloatRegisters::Double};
++static constexpr FloatRegister f2{FloatRegisters::f2, FloatRegisters::Double};
++static constexpr FloatRegister f3{FloatRegisters::f3, FloatRegisters::Double};
++static constexpr FloatRegister f4{FloatRegisters::f4, FloatRegisters::Double};
++static constexpr FloatRegister f5{FloatRegisters::f5, FloatRegisters::Double};
++static constexpr FloatRegister f6{FloatRegisters::f6, FloatRegisters::Double};
++static constexpr FloatRegister f7{FloatRegisters::f7, FloatRegisters::Double};
++static constexpr FloatRegister f8{FloatRegisters::f8, FloatRegisters::Double};
++static constexpr FloatRegister f9{FloatRegisters::f9, FloatRegisters::Double};
++static constexpr FloatRegister f10{FloatRegisters::f10, FloatRegisters::Double};
++static constexpr FloatRegister f11{FloatRegisters::f11, FloatRegisters::Double};
++static constexpr FloatRegister f12{FloatRegisters::f12, FloatRegisters::Double};
++static constexpr FloatRegister f13{FloatRegisters::f13, FloatRegisters::Double};
++static constexpr FloatRegister f14{FloatRegisters::f14, FloatRegisters::Double};
++static constexpr FloatRegister f15{FloatRegisters::f15, FloatRegisters::Double};
++static constexpr FloatRegister f16{FloatRegisters::f16, FloatRegisters::Double};
++static constexpr FloatRegister f17{FloatRegisters::f17, FloatRegisters::Double};
++static constexpr FloatRegister f18{FloatRegisters::f18, FloatRegisters::Double};
++static constexpr FloatRegister f19{FloatRegisters::f19, FloatRegisters::Double};
++static constexpr FloatRegister f20{FloatRegisters::f20, FloatRegisters::Double};
++static constexpr FloatRegister f21{FloatRegisters::f21, FloatRegisters::Double};
++static constexpr FloatRegister f22{FloatRegisters::f22, FloatRegisters::Double};
++static constexpr FloatRegister f23{FloatRegisters::f23, FloatRegisters::Double};
++static constexpr FloatRegister f24{FloatRegisters::f24, FloatRegisters::Double};
++static constexpr FloatRegister f25{FloatRegisters::f25, FloatRegisters::Double};
++static constexpr FloatRegister f26{FloatRegisters::f26, FloatRegisters::Double};
++static constexpr FloatRegister f27{FloatRegisters::f27, FloatRegisters::Double};
++static constexpr FloatRegister f28{FloatRegisters::f28, FloatRegisters::Double};
++static constexpr FloatRegister f29{FloatRegisters::f29, FloatRegisters::Double};
++static constexpr FloatRegister f30{FloatRegisters::f30, FloatRegisters::Double};
++static constexpr FloatRegister f31{FloatRegisters::f31, FloatRegisters::Double};
++
++static constexpr Register InvalidReg{Registers::Invalid};
++static constexpr FloatRegister InvalidFloatReg;
++
++static constexpr Register StackPointer = r1;
++static constexpr Register FramePointer = r31;
++static constexpr Register ReturnReg = r3;
++static constexpr Register64 ReturnReg64(ReturnReg);
++static constexpr FloatRegister ReturnFloat32Reg{FloatRegisters::f1,
++                                                FloatRegisters::Single};
++static constexpr FloatRegister ReturnDoubleReg = f1;
++static constexpr FloatRegister ReturnSimd128Reg{FloatRegisters::f1,
++                                                FloatRegisters::Simd128};
++
++// r16 is non-volatile and non-allocatable, used as a saved scratch.
++static constexpr Register SavedScratchRegister = r16;
++
++static constexpr Register SecondScratchReg = r12;
++
++static constexpr FloatRegister ScratchFloat32Reg{FloatRegisters::f0,
++                                                 FloatRegisters::Single};
++static constexpr FloatRegister ScratchDoubleReg = f0;
++static constexpr FloatRegister ScratchSimd128Reg{FloatRegisters::f0,
++                                                 FloatRegisters::Simd128};
++
++struct ScratchFloat32Scope : public AutoFloatRegisterScope {
++  explicit ScratchFloat32Scope(MacroAssembler& masm)
++      : AutoFloatRegisterScope(masm, ScratchFloat32Reg) {}
++};
++
++struct ScratchDoubleScope : public AutoFloatRegisterScope {
++  explicit ScratchDoubleScope(MacroAssembler& masm)
++      : AutoFloatRegisterScope(masm, ScratchDoubleReg) {}
++};
++
++// PPC64: ScratchSimd128Scope is a simple register wrapper, NOT a scoped
++// acquire/release. On PPC64, ScratchSimd128Reg is v0 (VSR32; encoded as
++// {FloatRegisters::f0, Simd128} so encoding() = 0 + 32 = 32) — distinct
++// from ScratchDoubleReg = f0 (VSR0). It is non-allocatable and always
++// available. Many SIMD functions call other SIMD functions that also need
++// v0, creating nested "scopes". Using AutoFloatRegisterScope would assert
++// on double-acquire in debug builds. Since v0 is never allocated by the
++// register allocator, nesting is safe.
++struct ScratchSimd128Scope : public FloatRegister {
++  explicit ScratchSimd128Scope(MacroAssembler&)
++      : FloatRegister(ScratchSimd128Reg) {}
++};
++
++class Assembler;
++
++class UseScratchRegisterScope {
++ public:
++  explicit UseScratchRegisterScope(Assembler& assembler);
++  explicit UseScratchRegisterScope(Assembler* assembler);
++  ~UseScratchRegisterScope();
++
++  Register Acquire();
++  void Release(const Register& reg);
++  bool hasAvailable() const;
++  void Include(const GeneralRegisterSet& list) {
++    *available_ = GeneralRegisterSet::Union(*available_, list);
++  }
++  void Exclude(const GeneralRegisterSet& list) {
++    *available_ = GeneralRegisterSet::Subtract(*available_, list);
++  }
++
++ private:
++  GeneralRegisterSet* available_;
++  GeneralRegisterSet old_available_;
++};
++
++static constexpr Register OsrFrameReg = r6;
++static constexpr Register PreBarrierReg = r4;
++static constexpr Register InterpreterPCReg = r17;
++
++static constexpr Register CallTempReg0 = r4;
++static constexpr Register CallTempReg1 = r9;
++static constexpr Register CallTempReg2 = r10;
++static constexpr Register CallTempReg3 = r7;
++// CallTempReg4 must NOT be JSReturnReg (r5): LMegamorphicLoadSlotPermissive
++// uses tempFixed(CallTempReg4) for a saved obj pointer AND defineReturn
++// (JSReturnOperand=r5) for output. If they alias, the megamorphic cache
++// lookup clobbers the saved obj, corrupting the 'this' pointer.
++static constexpr Register CallTempReg4 = r8;
++static constexpr Register CallTempReg5 = r6;
++
++// PPC64 ELFv2 has no volatile non-arg GPRs (r3-r10 are all arg regs).
++// Use allocatable non-volatile registers as overflow temps.
++static constexpr Register CallTempNonArgRegs[] = {r14, r15};
++static const uint32_t NumCallTempNonArgRegs = std::size(CallTempNonArgRegs);
++
++static constexpr Register IntArgReg0 = r3;
++static constexpr Register IntArgReg1 = r4;
++static constexpr Register IntArgReg2 = r5;
++static constexpr Register IntArgReg3 = r6;
++static constexpr Register IntArgReg4 = r7;
++static constexpr Register IntArgReg5 = r8;
++static constexpr Register IntArgReg6 = r9;
++static constexpr Register IntArgReg7 = r10;
++
++// Registers used by RegExpMatcher and RegExpExecMatch stubs.
++static constexpr Register RegExpMatcherRegExpReg = CallTempReg0;
++static constexpr Register RegExpMatcherStringReg = CallTempReg1;
++static constexpr Register RegExpMatcherLastIndexReg = CallTempReg2;
++
++// Registers used by RegExpExecTest stub (do not use ReturnReg).
++static constexpr Register RegExpExecTestRegExpReg = CallTempReg0;
++static constexpr Register RegExpExecTestStringReg = CallTempReg1;
++
++// Registers used by RegExpSearcher stub (do not use ReturnReg).
++static constexpr Register RegExpSearcherRegExpReg = CallTempReg0;
++static constexpr Register RegExpSearcherStringReg = CallTempReg1;
++static constexpr Register RegExpSearcherLastIndexReg = CallTempReg2;
++
++static constexpr Register JSReturnReg_Type = r6;
++static constexpr Register JSReturnReg_Data = r5;
++static constexpr Register JSReturnReg = r5;
++static constexpr ValueOperand JSReturnOperand = ValueOperand(JSReturnReg);
++
++static constexpr Register ABINonArgReg0 = r19;
++static constexpr Register ABINonArgReg1 = r20;
++static constexpr Register ABINonArgReg2 = r21;
++static constexpr Register ABINonArgReg3 = r22;
++static constexpr Register ABINonArgReturnReg0 = r29;
++static constexpr Register ABINonArgReturnReg1 = r30;
++static constexpr Register ABINonVolatileReg = r14;
++static constexpr Register ABINonArgReturnVolatileReg = r11;
++
++static constexpr FloatRegister ABINonArgDoubleReg{FloatRegisters::f14,
++                                                  FloatRegisters::Double};
++
++// Wasm instance pointer register. Preserved across wasm function calls.
++static constexpr Register InstanceReg = r18;
++static constexpr Register HeapReg = r24;
++static constexpr Register GlobalReg = r23;
++
++// Wasm table call registers.
++static constexpr Register WasmTableCallScratchReg0 = ABINonArgReg0;
++static constexpr Register WasmTableCallScratchReg1 = ABINonArgReg1;
++static constexpr Register WasmTableCallSigReg = ABINonArgReg2;
++static constexpr Register WasmTableCallIndexReg = ABINonArgReg3;
++
++// Wasm ref call registers.
++static constexpr Register WasmCallRefCallScratchReg0 = ABINonArgReg0;
++static constexpr Register WasmCallRefCallScratchReg1 = ABINonArgReg1;
++static constexpr Register WasmCallRefCallScratchReg2 = ABINonArgReg2;
++static constexpr Register WasmCallRefReg = ABINonArgReg3;
++
++// Wasm tail call scratch registers.
++// WasmTailCallRAScratchReg must NOT be ABINonArgReg0: the shared tail-call
++// code (wasmReturnCallImport, wasmReturnCallIndirect, wasmReturnCallRef)
++// stores the callee address in ABINonArgReg0, and CollapseWasmFrame*
++// overwrites tempForRA. On architectures with a GPR link register (ARM,
++// MIPS, LA64, RISC-V) this is ra/lr. PPC64's LR is an SPR, so we use r14
++// (ABINonVolatileReg) which is callee-saved and not used in call setup.
++static constexpr Register WasmTailCallInstanceScratchReg = ABINonArgReg1;
++static constexpr Register WasmTailCallRAScratchReg = ABINonVolatileReg;
++static constexpr Register WasmTailCallFPScratchReg = ABINonArgReg3;
++
++// Register used as a scratch along the return path in the fast js -> wasm stub
++// code. Must not overlap ReturnReg, JSReturnOperand, or InstanceReg.
++// Must be volatile.
++static constexpr Register WasmJitEntryReturnScratch = r10;
++
++static constexpr uint32_t ABIStackAlignment = 16;
++static constexpr uint32_t CodeAlignment = 16;
++static constexpr uint32_t JitStackAlignment = 16;
++
++static constexpr uint32_t JitStackValueAlignment =
++    JitStackAlignment / sizeof(Value);
++static_assert(JitStackAlignment % sizeof(Value) == 0 &&
++                  JitStackValueAlignment >= 1,
++              "Stack alignment should be a non-zero multiple of sizeof(Value)");
++
++static constexpr uint32_t SimdMemoryAlignment = 16;
++static_assert(
++    CodeAlignment % SimdMemoryAlignment == 0,
++    "Code alignment should be larger than any of the alignments "
++    "which are used for the constant sections of the code buffer. "
++    "Thus it should be larger than the alignment for SIMD constants.");
++
++static constexpr uint32_t WasmStackAlignment = SimdMemoryAlignment;
++static const uint32_t WasmTrapInstructionLength = 4;
++
++static constexpr uint32_t WasmCheckedCallEntryOffset = 0u;
++static constexpr uint32_t WasmCheckedTailEntryOffset = 32u;
++
++static constexpr Scale ScalePointer = TimesEight;
++
++class ABIArgGenerator : public ABIArgGeneratorShared {
++ public:
++  explicit ABIArgGenerator(ABIKind kind)
++      : ABIArgGeneratorShared(kind),
++        intRegIndex_(0),
++        floatRegIndex_(0),
++        current_() {
++    // PPC64 ELFv2 ABI: the callee saves LR, CR, TOC into the caller's
++    // frame (offsets 8, 16, 24 from caller SP). Reserve 32 bytes so that
++    // callWithABIPre always allocates enough space for this link area.
++    stackOffset_ += ShadowStackSpace;
++  }
++
++  ABIArg next(MIRType argType);
++  ABIArg& current() { return current_; }
++
++ protected:
++  unsigned intRegIndex_;
++  unsigned floatRegIndex_;
++  ABIArg current_;
++};
++
++static constexpr uint32_t NumIntArgRegs = 8;
++static constexpr uint32_t NumFloatArgRegs = 13;
++
++static inline bool GetIntArgReg(uint32_t usedIntArgs, Register* out) {
++  if (usedIntArgs < NumIntArgRegs) {
++    *out = Register::FromCode(r3.code() + usedIntArgs);
++    return true;
++  }
++  return false;
++}
++
++static inline bool GetFloatArgReg(uint32_t usedFloatArgs, FloatRegister* out) {
++  if (usedFloatArgs < NumFloatArgRegs) {
++    *out = FloatRegister::FromCode(f1.code() + usedFloatArgs);
++    return true;
++  }
++  return false;
++}
++
++static inline bool GetTempRegForIntArg(uint32_t usedIntArgs,
++                                       uint32_t usedFloatArgs, Register* out) {
++  MOZ_ASSERT(usedFloatArgs == 0);
++
++  if (GetIntArgReg(usedIntArgs, out)) {
++    return true;
++  }
++
++  usedIntArgs -= NumIntArgRegs;
++  if (usedIntArgs >= NumCallTempNonArgRegs) {
++    return false;
++  }
++  *out = CallTempNonArgRegs[usedIntArgs];
++  return true;
++}
++
++// PPC64 instruction field positions.
++// PPC uses big-endian bit numbering (bit 0 = MSB), but we store instructions
++// in a uint32_t where bit 0 = LSB. The shifts below are in LSB-0 terms.
++//
++//   [0:5]  primary opcode   (OpcodeShift=26)
++//   [6:10]  RT/RS/BF/TO     (RTShift=21, 5 bits)
++//   [11:15] RA/BI           (RAShift=16, 5 bits)
++//   [16:20] RB/SH           (RBShift=11, 5 bits)
++//   [16:31] SI/UI/D         (Imm16Shift=0, 16 bits)
++//   [21:25] subop bits      (varies)
++//   [21:30] XO              (X-form; A/M/MD/MDS narrower)
++//   [31]    Rc bit          (RcShift=0)
++
++static const uint32_t OpcodeShift = 26;
++static const uint32_t OpcodeBits = 6;
++
++static const uint32_t RTShift = 21;
++static const uint32_t RTBits = 5;
++static const uint32_t RSShift = 21;
++static const uint32_t RSBits = 5;
++static const uint32_t RAShift = 16;
++static const uint32_t RABits = 5;
++static const uint32_t RBShift = 11;
++static const uint32_t RBBits = 5;
++static const uint32_t RCShift = 6;
++static const uint32_t RCBits = 5;
++
++static const uint32_t BOShift = 21;
++static const uint32_t BOBits = 5;
++static const uint32_t BIShift = 16;
++static const uint32_t BIBits = 5;
++
++static const uint32_t Imm16Shift = 0;
++static const uint32_t Imm16Bits = 16;
++
++static const uint32_t RcShift = 0;
++static const uint32_t RcBit = 1;
++
++static const uint32_t RTMask = ((1 << RTBits) - 1) << RTShift;
++static const uint32_t RSMask = ((1 << RSBits) - 1) << RSShift;
++static const uint32_t RAMask = ((1 << RABits) - 1) << RAShift;
++static const uint32_t RBMask = ((1 << RBBits) - 1) << RBShift;
++static const uint32_t Imm16Mask = (1 << Imm16Bits) - 1;
++static const uint32_t RegMask = (1 << RTBits) - 1;
++
++static inline uint32_t RT(Register r) { return (uint32_t)r.code() << RTShift; }
++static inline uint32_t RT(FloatRegister r) {
++  return (uint32_t)r.code() << RTShift;
++}
++static inline uint32_t RS(Register r) { return (uint32_t)r.code() << RSShift; }
++static inline uint32_t RS(FloatRegister r) {
++  return (uint32_t)r.code() << RSShift;
++}
++static inline uint32_t RA(Register r) { return (uint32_t)r.code() << RAShift; }
++static inline uint32_t RA(FloatRegister r) {
++  return (uint32_t)r.code() << RAShift;
++}
++static inline uint32_t RB(Register r) { return (uint32_t)r.code() << RBShift; }
++static inline uint32_t RB(FloatRegister r) {
++  return (uint32_t)r.code() << RBShift;
++}
++
++// SPR encoding: the SPR number is split across bits 11-15 and 16-20 in a
++// swapped arrangement.  PPC_SPR(x) produces the value to OR into an
++// mtspr/mfspr instruction at the RB+RA position (bits 11-20).
++#define PPC_SPR(x) ((((int)(x) >> 5) & 0x1f) << 11 | ((int)(x) & 0x1f) << 16)
++
++enum PPCOpcodes {
++  PPC_add = 0x7C000214,
++  PPC_addc = 0x7C000014,
++  PPC_adde = 0x7C000114,
++  PPC_addi = 0x38000000,
++  PPC_addis = 0x3C000000,
++  PPC_and_ = 0x7C000038,
++  // andi. is always record form (no non-record andi exists).
++  PPC_andi_dot = 0x70000000,
++  PPC_b = 0x48000000,
++  PPC_bc = 0x40000000,
++  // Encoded "bcl 20, lt, $+4": PC-relative branch-and-link by 4 bytes
++  // (land at the next instruction) with BO=20 (branch always); BI=0
++  // (=lt) is don't-care because BO=20 forces the branch. Used by
++  // PoolLoadFPR{32,64}'s POWER8 stanza and PoolLoadSimd128's stanza to
++  // seed LR with the current PC for the subsequent mflr+ld base
++  // computation. Used by patch sites that write raw instruction memory
++  // (PatchConstantPoolLoad, WriteLoad64Instructions, etc.). Named for
++  // grep-ability and to avoid magic-number copies.
++  PPC_bcl_always_plus4 = 0x42800005,
++  PPC_bctr = 0x4E800420,
++  PPC_bcctr = 0x4C000420,
++  PPC_blr = 0x4E800020,
++  PPC_cmpd = 0x7C200000,
++  PPC_cmpdi = 0x2C200000,
++  PPC_cmpld = 0x7C200040,
++  PPC_cmpldi = 0x28200000,
++  PPC_cmpw = 0x7C000000,
++  PPC_cmpwi = 0x2C000000,
++  PPC_cmplw = 0x7C000040,
++  PPC_cmplwi = 0x28000000,
++  PPC_cntlzd = 0x7C000074,
++  PPC_cntlzw = 0x7C000034,
++  PPC_cnttzd = 0x7C000474,
++  PPC_cnttzw = 0x7C000434,
++  PPC_crandc = 0x4C000102,
++  PPC_cror = 0x4C000382,
++  PPC_crorc = 0x4C000342,
++  PPC_divd = 0x7C0003D2,
++  PPC_divdu = 0x7C000392,
++  PPC_divw = 0x7C0003D6,
++  PPC_divwu = 0x7C000396,
++  // POWER9 (ISA 3.0) modulo instructions.
++  PPC_modsd = 0x7C000612,
++  PPC_modsw = 0x7C000616,
++  PPC_modud = 0x7C000212,
++  PPC_moduw = 0x7C000216,
++  PPC_extsb = 0x7C000774,
++  PPC_extsh = 0x7C000734,
++  PPC_extsw = 0x7C0007B4,
++  PPC_fabs = 0xFC000210,
++  PPC_fadd = 0xFC00002A,
++  PPC_fadds = 0xEC00002A,
++  PPC_fcpsgn = 0xFC000010,
++  PPC_fcfid = 0xFC00069C,
++  PPC_fcfids = 0xEC00069C,
++  PPC_fcfidu = 0xFC00079C,
++  PPC_fcfidus = 0xEC00079C,
++  PPC_fcmpu = 0xFC000000,
++  PPC_fctid = 0xFC00065C,
++  PPC_fctidz = 0xFC00065E,
++  PPC_fctiduz = 0xFC00075E,
++  PPC_fctiwz = 0xFC00001E,
++  PPC_fdiv = 0xFC000024,
++  PPC_fdivs = 0xEC000024,
++  PPC_fmr = 0xFC000090,
++  PPC_fmul = 0xFC000032,
++  PPC_fmuls = 0xEC000032,
++  PPC_fneg = 0xFC000050,
++  PPC_frim = 0xFC0003D0,
++  PPC_frip = 0xFC000390,
++  PPC_friz = 0xFC000350,
++  PPC_frsp = 0xFC000018,
++  PPC_fsub = 0xFC000028,
++  PPC_fsubs = 0xEC000028,
++  PPC_fsqrt = 0xFC00002C,
++  PPC_fsqrts = 0xEC00002C,
++  PPC_isel = 0x7C00001E,
++  // POWER10 (ISA 3.1). RT = (CR[BI]==1) ? 1 : 0. XO=384 at bits 21-30.
++  PPC_setbc = 0x7C000300,
++  // POWER10 (ISA 3.1). RT = (CR[BI]==0) ? 1 : 0. XO=416.
++  PPC_setbcr = 0x7C000340,
++  PPC_lbarx = 0x7C000068,
++  PPC_lbz = 0x88000000,
++  PPC_lbzx = 0x7C0000AE,
++  PPC_ld = 0xE8000000,
++  PPC_ldarx = 0x7C0000A8,
++  PPC_ldx = 0x7C00002A,
++  PPC_lfd = 0xC8000000,
++  PPC_lfdx = 0x7C0004AE,
++  PPC_lfiwax = 0x7C0006AE,
++  PPC_lfiwzx = 0x7C0006EE,
++  PPC_lfs = 0xC0000000,
++  PPC_lfsx = 0x7C00042E,
++  PPC_lha = 0xA8000000,
++  PPC_lharx = 0x7C0000E8,
++  PPC_lhax = 0x7C0002AE,
++  PPC_lhz = 0xA0000000,
++  PPC_lhzx = 0x7C00022E,
++  PPC_lwa = 0xE8000002,
++  PPC_lwarx = 0x7C000028,
++  PPC_lwz = 0x80000000,
++  // X-form sign-extending word load (opcode 31, XO=341). Single-insn
++  // equivalent of lwzx + extsw.
++  PPC_lwax = 0x7C0002AA,
++  PPC_lwzx = 0x7C00002E,
++  PPC_mcrxrx = 0x7C000480,
++  PPC_mcrfs = 0xFC000080,
++  PPC_mfocrf = 0x7C100026,
++  PPC_mffs = 0xFC00048E,
++  PPC_mfspr = 0x7C0002A6,
++  PPC_mfvsrd = 0x7C000066,
++  PPC_mtcrf = 0x7C000120,
++  PPC_mtfsb0 = 0xFC00008C,
++  PPC_mtvsrd = 0x7C000166,
++  // POWER8+ (ISA 2.07). VSR[XT].dw[0] = sign_ext_64(RA[32:63]).
++  // XO=211 at bits 21-30. Combines extsw + mtvsrd into one insn.
++  PPC_mtvsrwa = 0x7C0001A6,
++  PPC_mtvsrws = 0x7C000326,
++  PPC_mtvsrwz = 0x7C0001E6,
++  PPC_mtspr = 0x7C0003A6,
++  PPC_mulhd = 0x7C000092,
++  PPC_mulhdu = 0x7C000012,
++  PPC_mulhwu = 0x7C000016,
++  PPC_mulli = 0x1C000000,
++  PPC_mulld = 0x7C0001D2,
++  PPC_mulldo = 0x7C0005D2,
++  PPC_mullw = 0x7C0001D6,
++  PPC_neg = 0x7C0000D0,
++  PPC_nor = 0x7C0000F8,
++  PPC_or_ = 0x7C000378,
++  PPC_ori = 0x60000000,
++  PPC_oris = 0x64000000,
++  PPC_popcntb = 0x7C0000F4,
++  PPC_popcntd = 0x7C0003F4,
++  PPC_popcntw = 0x7C0002F4,
++  PPC_brd = 0x7C000176,  // POWER10: byte-reverse doubleword (X-form, XO=187)
++  PPC_brh = 0x7C0001B6,  // POWER10: byte-reverse each halfword (X-form, XO=219)
++  PPC_brw = 0x7C000136,  // POWER10: byte-reverse each word     (X-form, XO=155)
++  PPC_rldcl = 0x78000010,
++  PPC_rldicl = 0x78000000,
++  PPC_rldcr = 0x78000012,
++  PPC_rldicr = 0x78000004,
++  PPC_rldimi = 0x7800000C,
++  PPC_rlwimi = 0x50000000,
++  PPC_rlwinm = 0x54000000,
++  PPC_rlwnm = 0x5C000000,
++  PPC_sld = 0x7C000036,
++  PPC_slw = 0x7C000030,
++  PPC_srad = 0x7C000634,
++  PPC_sradi = 0x7C000674,
++  PPC_sraw = 0x7C000630,
++  PPC_srawi = 0x7C000670,
++  PPC_srd = 0x7C000436,
++  PPC_srw = 0x7C000430,
++  PPC_stb = 0x98000000,
++  PPC_stbcx = 0x7C00056D,
++  PPC_stbx = 0x7C0001AE,
++  PPC_std = 0xF8000000,
++  PPC_stdcx = 0x7C0001AD,
++  PPC_stdu = 0xF8000001,
++  PPC_stdx = 0x7C00012A,
++  PPC_stfd = 0xD8000000,
++  PPC_stfdu = 0xDC000000,
++  PPC_stfdx = 0x7C0005AE,
++  PPC_stfs = 0xD0000000,
++  PPC_stfsu = 0xD4000000,
++  PPC_stfsx = 0x7C00052E,
++  PPC_sth = 0xB0000000,
++  PPC_sthcx = 0x7C0005AD,
++  PPC_sthx = 0x7C00032E,
++  PPC_stw = 0x90000000,
++  PPC_stwx = 0x7C00012E,
++  PPC_stwbrx = 0x7C00052C,
++  PPC_stwcx = 0x7C00012D,
++  PPC_subf = 0x7C000050,
++  PPC_subfc = 0x7C000010,
++  PPC_subfe = 0x7C000110,
++  PPC_subfic = 0x20000000,
++  PPC_sync = 0x7C0004AC,
++  // isync — execution synchronization. Discards prefetched instructions and
++  // forces a refetch+reexecute of everything past the barrier; prevents
++  // speculative bypass. Used for Spectre v1 mitigation in speculationBarrier.
++  // Encoding: bytes `2c 01 00 4c` (LE) = 0x4C00012C.
++  PPC_isync = 0x4C00012C,
++  PPC_trap = 0x7FE00008,
++  PPC_tw = 0x7C000008,
++  PPC_xor_ = 0x7C000278,
++  PPC_xori = 0x68000000,
++  PPC_xoris = 0x6C000000,
++  // VMX register load/store (X-form, opcode 31, XO=103/231).
++  // Operate on raw VR0-31 (the lvx/stvx mnemonics predate VSX, so the
++  // assembler exposes them with a uint8_t VR index rather than via the
++  // VSR-namespace FloatRegister overloads used for lxvx/stxvx.)
++  PPC_lvx = 0x7C0000CE,
++  PPC_lxvd2x = 0x7C000698,
++  PPC_lxvx = 0x7C000218,
++  PPC_mfvsrld = 0x7C000266,
++  PPC_mtvsrdd = 0x7C000366,
++  PPC_stvx = 0x7C0001CE,
++  PPC_stxvd2x = 0x7C000798,
++  PPC_stxvx = 0x7C000318,
++  PPC_vaddubm = 0x10000000,
++  PPC_vavgub = 0x10000402,
++  PPC_vavguh = 0x10000442,
++  PPC_vcmpequb = 0x10000006,
++  PPC_vcmpequh = 0x10000046,
++  PPC_vcmpequw = 0x10000086,
++  PPC_vcmpequd = 0x100000C7,
++  PPC_vcmpgtsb = 0x10000306,
++  PPC_vcmpgtsh = 0x10000346,
++  PPC_vcmpgtsw = 0x10000386,
++  PPC_vcmpgtsd = 0x100003C7,
++  PPC_vcmpgtub = 0x10000206,
++  PPC_vcmpgtuh = 0x10000246,
++  PPC_vcmpgtuw = 0x10000286,
++  PPC_vcmpgtud = 0x100002C7,
++  PPC_vcmpneb = 0x10000007,  // POWER9 (ISA 3.0)
++  PPC_vcmpneh = 0x10000047,  // POWER9
++  PPC_vcmpnew = 0x10000087,  // POWER9
++  PPC_vadduhm = 0x10000040,
++  PPC_vadduwm = 0x10000080,
++  PPC_vaddudm = 0x100000C0,
++  PPC_vaddubs = 0x10000200,
++  PPC_vadduhs = 0x10000240,
++  PPC_vaddsbs = 0x10000300,
++  PPC_vaddshs = 0x10000340,
++  PPC_vmaxsb = 0x10000102,
++  PPC_vmaxsh = 0x10000142,
++  PPC_vmaxsw = 0x10000182,
++  PPC_vmaxsd = 0x100001C2,
++  PPC_vmaxub = 0x10000002,
++  PPC_vmaxuh = 0x10000042,
++  PPC_vmaxuw = 0x10000082,
++  PPC_vmhraddshs = 0x10000021,
++  PPC_vmrghb = 0x1000000C,
++  PPC_vmrghh = 0x1000004C,
++  PPC_vmrghw = 0x1000008C,
++  PPC_vmrglb = 0x1000010C,
++  PPC_vmrglh = 0x1000014C,
++  PPC_vmrglw = 0x1000018C,
++  PPC_vminsb = 0x10000302,
++  PPC_vminsh = 0x10000342,
++  PPC_vminsw = 0x10000382,
++  PPC_vminub = 0x10000202,
++  PPC_vminuh = 0x10000242,
++  PPC_vminuw = 0x10000282,
++  // POWER9 (ISA 3.0) per-lane integer negate. VRA field carries the subop
++  // code: 6 for vnegw, 7 for vnegd. Base XO is 0x602.
++  PPC_vnegw = 0x10060602,
++  PPC_vnegd = 0x10070602,
++  PPC_vmladduhm = 0x10000022,
++  PPC_vmuluwm = 0x10000089,
++  PPC_vmulld = 0x100001C9,      // POWER10 (XO=457, vector i64x2 multiply low)
++  PPC_vmulesb = 0x10000308,
++  PPC_vmuleub = 0x10000208,
++  PPC_vmulesh = 0x10000348,
++  PPC_vmuleuh = 0x10000248,
++  PPC_vmulesw = 0x10000388,
++  PPC_vmuleuw = 0x10000288,
++  PPC_vmulosb = 0x10000108,
++  PPC_vmuloub = 0x10000008,
++  PPC_vmulosh = 0x10000148,
++  PPC_vmulouh = 0x10000048,
++  PPC_vmulosw = 0x10000188,
++  PPC_vmulouw = 0x10000088,
++  PPC_vmsumshm = 0x10000028,
++  PPC_vmsumuhm = 0x10000026,
++  PPC_vperm = 0x1000002B,
++  // VX-form, opcode 4, XO=0x54C. Per-byte bit-permute of a 128-bit value;
++  // result 16-bit bitmap lands in dw0 low 16 bits, recoverable via mfvsrd.
++  // Available on POWER8+ (ISA 2.07).
++  PPC_vbpermq = 0x1000054C,
++  // POWER10 (ISA 3.1) Vector Extract Mask. VX-form, opcode 4, XO=0x642,
++  // with UIM at bits 11..15 selecting lane width: 8=byte, 9=halfword,
++  // 10=word, 11=doubleword. RT is a GPR (low N bits = wasm bitmask).
++  PPC_vextractbm = 0x10080642,
++  PPC_vextracthm = 0x10090642,
++  PPC_vextractwm = 0x100A0642,
++  PPC_vextractdm = 0x100B0642,
++  // POWER10 vector insert from GPR at immediate byte offset:
++  //   vinsw VRT, RB, UIM   VRT[UIM*8:UIM*8+31] ← RB[32:63]
++  //   vinsd VRT, RB, UIM   VRT[UIM*8:UIM*8+63] ← RB[0:63]
++  // VX-form, opcode 4. RB at bits 16..20, UIM at bits 11..15.
++  PPC_vinsw = 0x100000CF,  // POWER10 (XO=207)
++  PPC_vinsd = 0x100001CF,  // POWER10 (XO=463)
++  // POWER10 vector insert byte/halfword from GPR with register-supplied
++  // (right-indexed = LE-natural) byte position:
++  //   vinsbrx VRT, RA, RB   VRT.byte[RA & 0xF]  ← RB & 0xFF
++  //   vinshrx VRT, RA, RB   VRT.hword[(RA & 0xE)/2] ← RB & 0xFFFF
++  // VX-form, opcode 4. RA at bits 16..20, RB at bits 11..15.
++  PPC_vinsbrx = 0x1000030F,  // POWER10 (XO=783)
++  PPC_vinshrx = 0x1000034F,  // POWER10 (XO=847)
++  // POWER9 (ISA 3.0) vector insert byte/halfword from VR at immediate
++  // byte position:
++  //   vinsertb VRT, VRB, UIM  VRT.byte[UIM]      ← VRB.byte[7]    (BE)
++  //   vinserth VRT, VRB, UIM  VRT.hword[UIM..+1] ← VRB.byte[6..7] (BE)
++  // V-form, opcode 4. VRB at bits 11..15, UIM at bits 16..20. Simd128
++  // lives in VSR32-63 (= VR0-31), so the V-form VRT field addresses our
++  // Simd128 storage via `encoding() & 31`.
++  PPC_vinsertb = 0x1000030D,  // POWER9 (XO=781)
++  PPC_vinserth = 0x1000034D,  // POWER9 (XO=845)
++  PPC_vextractub = 0x1000020D,  // POWER9 (XO=525)
++  PPC_vextractuh = 0x1000024D,  // POWER9 (XO=589)
++  PPC_vspltisb = 0x1000030C,    // POWER7+ (XO=780, splat 5-bit SIMM to all 16 byte lanes)
++  PPC_vspltish = 0x1000034C,    // POWER7+ (XO=844, splat 5-bit SIMM to all 8 i16 lanes)
++  PPC_vspltisw = 0x1000038C,    // POWER7+ (XO=908, splat 5-bit SIMM to all 4 i32 lanes)
++  PPC_vpopcntb = 0x10000703,
++  PPC_vslb = 0x10000104,
++  PPC_vsld = 0x100005C4,
++  PPC_vsldoi = 0x1000002C,
++  PPC_vslh = 0x10000144,
++  PPC_vslo = 0x1000040C,
++  PPC_vslw = 0x10000184,
++  PPC_vspltb = 0x1000020C,
++  PPC_vsplth = 0x1000024C,
++  PPC_vsrab = 0x10000304,
++  PPC_vsrad = 0x100003C4,
++  PPC_vsrah = 0x10000344,
++  PPC_vsraw = 0x10000384,
++  PPC_vsrb = 0x10000204,
++  PPC_vsrd = 0x100006C4,
++  PPC_vsrh = 0x10000244,
++  PPC_vsro = 0x1000044C,
++  PPC_vsrw = 0x10000284,
++  PPC_vpkshss = 0x1000018E,
++  PPC_vpkshus = 0x1000010E,
++  PPC_vpkswss = 0x100001CE,
++  PPC_vpkswus = 0x1000014E,
++  PPC_vupkhsb = 0x1000020E,
++  PPC_vupkhsh = 0x1000024E,
++  PPC_vupkhsw = 0x1000064E,
++  PPC_vupklsb = 0x1000028E,
++  PPC_vupklsh = 0x100002CE,
++  PPC_vupklsw = 0x100006CE,
++  PPC_vsububm = 0x10000400,
++  PPC_vsubuhm = 0x10000440,
++  PPC_vsubuwm = 0x10000480,
++  PPC_vsubudm = 0x100004C0,
++  PPC_vsububs = 0x10000600,
++  PPC_vsubuhs = 0x10000640,
++  PPC_vsubsbs = 0x10000700,
++  PPC_vsubshs = 0x10000740,
++  PPC_xscvdpspn = 0xF000042C,
++  PPC_xscvspdpn = 0xF000052C,
++  // POWER9 (ISA 3.0) scalar FP16 conversions, XX2-form. The UIM
++  // disambiguator is baked into the constant (xscvdphp=17, xscvhpdp=16).
++  // Encodings cross-checked against binutils with `.machine power9`.
++  PPC_xscvdphp = 0xF011056C,
++  PPC_xscvhpdp = 0xF010056C,
++  // POWER9 (ISA 3.0) scalar VSX extract biased exponent, XX2-form.
++  // XT.dword[0] = (zero || biased_exp_11bit), XT.dword[1] = 0. XO=347
++  // (shares XO with xscv{dp,hp}{hp,dp} — disambiguated by bits 16-20=0).
++  // Encoding cross-checked against binutils with `.machine power9`.
++  PPC_xsxexpdp = 0xF000056C,
++  // POWER9 (ISA 3.0) scalar FP16 load/store, X-form (opcode 31).
++  // lxsihzx zero-extends; stxsihx writes 16 bits from VSR dword 0
++  // word 1's low halfword.
++  PPC_lxsihzx = 0x7C00065A,
++  PPC_stxsihx = 0x7C00075A,
++  // POWER9 scalar VSX max/min with Java/JavaScript semantics — handles
++  // ±0 and NaN identically to Math.max/Math.min in ECMA-262 (covers
++  // 19 corner cases against the JS shell).
++  // XX3-form, primary opcode 60, XO=144 (max) / XO=152 (min).
++  PPC_xsmaxjdp = 0xF0000480,
++  PPC_xsminjdp = 0xF00004C0,
++  PPC_xxbrd = 0xF017076C,
++  PPC_xvabsdp = 0xF0000764,
++  PPC_xvabssp = 0xF0000664,
++  PPC_xvadddp = 0xF0000300,
++  PPC_xvaddsp = 0xF0000200,
++  PPC_xvcmpeqdp = 0xF0000318,
++  PPC_xvcmpeqsp = 0xF0000218,
++  PPC_xvcmpgedp = 0xF0000398,
++  PPC_xvcmpgesp = 0xF0000298,
++  PPC_xvcmpgtdp = 0xF0000358,
++  PPC_xvcmpgtsp = 0xF0000258,
++  PPC_xvcvdpsp = 0xF0000624,
++  PPC_xvcvdpsxws = 0xF0000360,
++  PPC_xvcvdpuxws = 0xF0000320,
++  PPC_xvcvspdp = 0xF0000724,
++  PPC_xvcvspsxws = 0xF0000260,
++  PPC_xvcvspuxws = 0xF0000220,
++  PPC_xvcvsxwdp = 0xF00003E0,
++  PPC_xvcvsxwsp = 0xF00002E0,
++  PPC_xvcvuxwdp = 0xF00003A0,
++  PPC_xvcvuxwsp = 0xF00002A0,
++  PPC_xvdivdp = 0xF00003C0,
++  PPC_xvdivsp = 0xF00002C0,
++  PPC_xvmaddadp = 0xF0000308,
++  PPC_xvmaddasp = 0xF0000208,
++  PPC_xvmaxdp = 0xF0000700,
++  PPC_xvmaxsp = 0xF0000600,
++  PPC_xvmindp = 0xF0000740,
++  PPC_xvminsp = 0xF0000640,
++  PPC_xvmuldp = 0xF0000380,
++  PPC_xvmulsp = 0xF0000280,
++  PPC_xvnegdp = 0xF00007E4,
++  PPC_xvnmsubadp = 0xF0000788,
++  PPC_xvnmsubasp = 0xF0000688,
++  PPC_xvnegsp = 0xF00006E4,
++  PPC_xvrdpic = 0xF00003AC,
++  PPC_xvrdpim = 0xF00003E4,
++  PPC_xvrdpip = 0xF00003A4,
++  PPC_xvrdpiz = 0xF0000364,
++  PPC_xvrspic = 0xF00002AC,
++  PPC_xvrspim = 0xF00002E4,
++  PPC_xvrspip = 0xF00002A4,
++  PPC_xvrspiz = 0xF0000264,
++  PPC_xvsqrtdp = 0xF000032C,
++  PPC_xvsqrtsp = 0xF000022C,
++  PPC_xvsubdp = 0xF0000340,
++  PPC_xvsubsp = 0xF0000240,
++  PPC_xxextractuw = 0xF0000294,
++  PPC_xxinsertw = 0xF00002D4,
++  PPC_xxland = 0xF0000410,
++  PPC_xxlandc = 0xF0000450,
++  PPC_xxlnor = 0xF0000510,
++  PPC_xxlor = 0xF0000490,
++  PPC_xxlxor = 0xF00004D0,
++  PPC_xxpermdi = 0xF0000050,
++  PPC_xxsel = 0xF0000030,
++  PPC_xxspltib = 0xF00002D0,  // POWER9 (ISA 3.0): XX1-form, no Rc
++  PPC_xxspltw = 0xF0000290,
++
++  // Simplified mnemonics.
++  PPC_mr = PPC_or_,
++  PPC_not = PPC_nor,
++  PPC_nop = PPC_ori,
++  PPC_lwsync = PPC_sync | (1 << 21),
++
++  PPC_MAJOR_OPCODE_MASK = 0xFC000000
++};
++
++static const uint32_t NopInst = (uint32_t)PPC_nop;
++static const uint32_t PPC_STANZA_LENGTH = 16;
++
++class Instruction;
++class InstReg;
++class InstImm;
++class BOffImm16;
++class JOffImm26;
++
++// PPC64 base instruction type: a single 32-bit word.
++class Instruction {
++ protected:
++  uint32_t data;
++
++ public:
++  explicit Instruction(uint32_t data_) : data(data_) {}
++  explicit Instruction(PPCOpcodes op) : data((uint32_t)op) {}
++
++  uint32_t encode() const { return data; }
++
++  void makeNop() { data = NopInst; }
++  void makeOp_mtctr(Register r) {
++    data = PPC_mtspr | ((uint32_t)r.code()) << 21 | PPC_SPR(9);
++  }
++  void makeOp_bctr(uint32_t linkBit = 0) { data = PPC_bctr | linkBit; }
++
++  void setData(uint32_t data) { this->data = data; }
++
++  const Instruction& operator=(const Instruction& src) {
++    data = src.data;
++    return *this;
++  }
++
++  uint32_t extractBit(uint32_t bit) const { return (encode() >> bit) & 1; }
++  uint32_t extractBitField(uint32_t hi, uint32_t lo) const {
++    return (encode() >> lo) & ((2 << (hi - lo)) - 1);
++  }
++
++  uint32_t extractOpcode() const { return data & PPC_MAJOR_OPCODE_MASK; }
++  bool isOpcode(uint32_t op) const {
++    return extractOpcode() == (op & PPC_MAJOR_OPCODE_MASK);
++  }
++
++  uint32_t extractRT() const {
++    return extractBitField(RTShift + RTBits - 1, RTShift);
++  }
++  uint32_t extractRA() const {
++    return extractBitField(RAShift + RABits - 1, RAShift);
++  }
++  uint32_t extractRB() const {
++    return extractBitField(RBShift + RBBits - 1, RBShift);
++  }
++  uint32_t extractImm16() const { return data & Imm16Mask; }
++
++  Instruction* next() { return this + 1; }
++
++  const uint32_t* raw() const { return &data; }
++  uint32_t size() const { return 4; }
++};
++
++static_assert(sizeof(Instruction) == 4);
++
++class InstNOP : public Instruction {
++ public:
++  InstNOP() : Instruction(NopInst) {}
++};
++
++// Register-register-register instruction (X-form and XO-form).
++class InstReg : public Instruction {
++ public:
++  explicit InstReg(PPCOpcodes op) : Instruction(op) {}
++  InstReg(PPCOpcodes op, Register rt, Register ra, Register rb)
++      : Instruction((uint32_t)op | RT(rt) | RA(ra) | RB(rb)) {}
++  InstReg(PPCOpcodes op, FloatRegister frt, FloatRegister fra,
++          FloatRegister frb)
++      : Instruction((uint32_t)op | RT(frt) | RA(fra) | RB(frb)) {}
++
++  void setRT(Register r) { data = (data & ~RTMask) | RT(r); }
++  void setRA(Register r) { data = (data & ~RAMask) | RA(r); }
++  void setRB(Register r) { data = (data & ~RBMask) | RB(r); }
++
++  void setImm16(uint32_t imm) {
++    data = (data & 0xFFFF0000) | (imm & Imm16Mask);
++  }
++  uint32_t extractImm16Value() const { return data & Imm16Mask; }
++};
++
++// Register-immediate instruction (D-form).
++// Bits 21-25 hold RT (loads, addi) or RS (stores, ori). Both encode identically
++// since RT and RS occupy the same field; the caller simply passes the right
++// register.
++class InstImm : public Instruction {
++ public:
++  explicit InstImm(PPCOpcodes op) : Instruction(op) {}
++  InstImm(PPCOpcodes op, Register rt, Register ra, uint32_t imm16)
++      : Instruction((uint32_t)op | RT(rt) | RA(ra) | (imm16 & Imm16Mask)) {}
++
++  void setRT(Register r) { data = (data & ~RTMask) | RT(r); }
++  void setRA(Register r) { data = (data & ~RAMask) | RA(r); }
++
++  void setImm16(uint32_t imm) {
++    data = (data & 0xFFFF0000) | (imm & Imm16Mask);
++  }
++  void setLowerReg(Register rl) {
++    data = (data & 0xFFE0FFFF) | ((uint32_t)rl.code() << 16);
++  }
++  uint32_t extractImm16Value() const { return data & Imm16Mask; }
++
++  // Extract the TrapTag from a tagged trap instruction (tw).
++  // Defined in Assembler-ppc64.cpp. Returns a TrapTag value as uint8_t
++  // because Assembler::TrapTag is not yet defined at this point in the header.
++  uint8_t traptag();
++};
++
++// A BOffImm16 is a 16-bit signed branch offset for conditional branches
++// (bc-form instructions).  The offset is stored in bits 2..15 and is
++// 4-byte aligned, giving a range of +/-32 KB.
++class BOffImm16 {
++  int32_t data;
++
++ public:
++  uint32_t encode() const {
++    MOZ_ASSERT(!isInvalid());
++    return static_cast<uint32_t>(data) & 0xFFFC;
++  }
++  int32_t decode() const {
++    MOZ_ASSERT(!isInvalid());
++    return data;
++  }
++
++  explicit BOffImm16(int offset) : data(offset) {
++    MOZ_ASSERT((offset & 0x3) == 0);
++    MOZ_ASSERT(IsInRange(offset));
++  }
++  static bool IsInRange(int offset) {
++    return offset >= -32768 && offset <= 32764;
++  }
++
++  static const int32_t INVALID = 0x00020000;
++  BOffImm16() : data(INVALID) {}
++
++  bool isInvalid() const { return data == INVALID; }
++
++  Instruction* getDest(Instruction* src) const;
++
++  explicit BOffImm16(InstImm inst);
++};
++
++// A JOffImm26 is a 26-bit signed branch offset for unconditional branches
++// (b/bl instructions).  Bits 2..25 encode the offset, 4-byte aligned,
++// giving a range of +/-32 MB.
++class JOffImm26 {
++  int32_t data;
++
++ public:
++  uint32_t encode() const {
++    MOZ_ASSERT(!isInvalid());
++    return static_cast<uint32_t>(data) & 0x03FFFFFC;
++  }
++  int32_t decode() const {
++    MOZ_ASSERT(!isInvalid());
++    return data;
++  }
++
++  explicit JOffImm26(int offset) : data(offset) {
++    MOZ_ASSERT((offset & 0x3) == 0);
++    MOZ_ASSERT(IsInRange(offset));
++  }
++  static bool IsInRange(int offset) {
++    return offset >= -33554432 && offset <= 33554428;
++  }
++
++  static const int32_t INVALID = 0x20000000;
++  JOffImm26() : data(INVALID) {}
++
++  bool isInvalid() const { return data == INVALID; }
++
++  Instruction* getDest(Instruction* src) const;
++};
++
++// A 16-bit immediate value used in D-form instructions.
++class Imm16 {
++  int32_t value;
++
++ public:
++  Imm16();
++  explicit Imm16(uint32_t imm) : value(imm) {}
++  uint32_t encode() const { return static_cast<uint32_t>(value) & 0xffff; }
++  int32_t decodeSigned() const { return value; }
++  uint32_t decodeUnsigned() const { return value; }
++  static bool IsInSignedRange(int32_t imm) {
++    return imm >= INT16_MIN && imm <= INT16_MAX;
++  }
++  static bool IsInUnsignedRange(uint32_t imm) { return imm <= UINT16_MAX; }
++  static Imm16 Lower(Imm32 imm) { return Imm16(imm.value & 0xffff); }
++  static Imm16 Upper(Imm32 imm) { return Imm16((imm.value >> 16) & 0xffff); }
++};
++
++class Imm8 {
++  uint8_t value;
++
++ public:
++  Imm8();
++  explicit Imm8(uint32_t imm) : value(imm) {}
++  uint32_t encode(uint32_t shift) const { return value << shift; }
++  int32_t decodeSigned() const { return value; }
++  uint32_t decodeUnsigned() const { return value; }
++  static bool IsInSignedRange(int32_t imm) {
++    return imm >= INT8_MIN && imm <= INT8_MAX;
++  }
++  static bool IsInUnsignedRange(uint32_t imm) { return imm <= UINT8_MAX; }
++  static Imm8 Lower(Imm16 imm) { return Imm8(imm.decodeSigned() & 0xff); }
++  static Imm8 Upper(Imm16 imm) {
++    return Imm8((imm.decodeSigned() >> 8) & 0xff);
++  }
++};
++
++class Operand {
++ public:
++  enum Tag { REG, FREG, MEM };
++
++ private:
++  Tag tag : 3;
++  uint32_t reg : 5;
++  int32_t offset;
++
++ public:
++  MOZ_IMPLICIT Operand(Register reg_) : tag(REG), reg(reg_.code()) {}
++
++  explicit Operand(FloatRegister freg) : tag(FREG), reg(freg.code()) {}
++
++  Operand(Register base, Imm32 off)
++      : tag(MEM), reg(base.code()), offset(off.value) {}
++
++  Operand(Register base, int32_t off)
++      : tag(MEM), reg(base.code()), offset(off) {}
++
++  explicit Operand(const Address& addr)
++      : tag(MEM), reg(addr.base.code()), offset(addr.offset) {}
++
++  Tag getTag() const { return tag; }
++
++  Register toReg() const {
++    MOZ_ASSERT(tag == REG);
++    return Register::FromCode(reg);
++  }
++
++  FloatRegister toFReg() const {
++    MOZ_ASSERT(tag == FREG);
++    return FloatRegister::FromCode(reg);
++  }
++
++  void toAddr(Register* r, Imm32* dest) const {
++    MOZ_ASSERT(tag == MEM);
++    *r = Register::FromCode(reg);
++    *dest = Imm32(offset);
++  }
++  Address toAddress() const {
++    MOZ_ASSERT(tag == MEM);
++    return Address(Register::FromCode(reg), offset);
++  }
++  int32_t disp() const {
++    MOZ_ASSERT(tag == MEM);
++    return offset;
++  }
++
++  int32_t base() const {
++    MOZ_ASSERT(tag == MEM);
++    return reg;
++  }
++  Register baseReg() const {
++    MOZ_ASSERT(tag == MEM);
++    return Register::FromCode(reg);
++  }
++};
++
++// Bug 2034064 collapsed the per-buffer compile-time configuration of
++// AssemblerBufferWithConstantPools into AssemblerBufferSettings, and reduced
++// the runtime ctor to (poolMaxOffset, nopFill). instBufferAlign and the
++// NumShortBranchRanges template arg were dropped: PPC64 previously passed
++// instBufferAlign=8 (unused on this backend; pool entries are 4-byte aligned)
++// and NumShortBranchRanges=0.
++using PPCBuffer = js::jit::AssemblerBufferWithConstantPools<
++    Instruction, Assembler,
++    js::jit::AssemblerBufferSettings{
++        .instSize = 4,
++        .guardSize = 1,
++        .headerSize = 1,
++        .pcBias = 0,
++        .alignFillInst = NopInst,
++        .nopFillInst = NopInst,
++    }>;
++
++// Inherits executableCopy() and appendRawCode() from
++// AssemblerBufferWithConstantPools, which assert pool is flushed.
++class PPCBufferWithExecutableCopy : public PPCBuffer {
++ public:
++  PPCBufferWithExecutableCopy(size_t poolMaxOffset, unsigned nopFill)
++      : PPCBuffer(poolMaxOffset, nopFill) {}
++};
++
++class Assembler : public AssemblerShared {
++ public:
++  // Trap tags encoded in the low bits of a trap word.
++  // FreeBSD and others may use r1 in their trap word, so bit 0 is avoided.
++  enum TrapTag {
++    BTag = 2,
++    BCTag = 4,
++    CallTag = 6,
++    DebugTag0 = 10,
++    DebugTag1 = 12,
++    DebugTag2 = 14
++  };
++
++  // Pool load types encoded in bits 21-22 of pool hint words.
++  // Used by InsertIndexIntoTag / PatchConstantPoolLoad.
++  enum PoolLoadType {
++    PoolLoadFPR64 = 1,    // lfd fD, offset(rBase)
++    PoolLoadSimd128 = 2,  // addi rBase, rBase, offset; lxvx vsD, 0, rBase
++    PoolLoadFPR32 = 3     // lfs fD, offset(rBase) — auto-expands to double
++  };
++
++  enum BranchBits {
++    BranchOnClear = 0x04,
++    BranchOnSet = 0x0c,
++    BranchOptionMask = 0x0f,
++    BranchOptionInvert = 0x08
++  };
++
++  // PPC condition encoding. The top nybble is the offset to the CR field
++  // (the x in BIF*4+x), and the bottom is the BO field.
++  // Synthetic flags sit in the MSB and are masked off before use.
++  enum Condition {
++    ConditionUnsigned = 0x100,
++    ConditionUnsignedHandled = 0x2ff,
++    ConditionZero = 0x400,
++    ConditionOnlyXER = 0x200,
++    ConditionXERCA = 0x23c,
++    ConditionXERNCA = 0x234,
++    ConditionXEROV = 0x21c,
++
++    Equal = 0x2c,
++    NotEqual = 0x24,
++    GreaterThan = 0x1c,
++    GreaterThanOrEqual = 0x04,
++    LessThan = 0x0c,
++    LessThanOrEqual = 0x14,
++
++    Above = GreaterThan | ConditionUnsigned,
++    AboveOrEqual = GreaterThanOrEqual | ConditionUnsigned,
++    Below = LessThan | ConditionUnsigned,
++    BelowOrEqual = LessThanOrEqual | ConditionUnsigned,
++
++    Signed = LessThan | ConditionZero,
++    NotSigned = GreaterThanOrEqual | ConditionZero,
++    Zero = Equal | ConditionZero,
++    NonZero = NotEqual | ConditionZero,
++
++    Overflow = ConditionXEROV,
++    NotOverflow = ConditionOnlyXER | LessThanOrEqual,
++    CarrySet = ConditionXERCA,
++    CarryClear = ConditionXERNCA,
++
++    Always = 0x1f,
++    SOBit = 0x3c,
++    NSOBit = 0x34
++  };
++
++  enum DoubleCondition {
++    DoubleConditionUnordered = 0x100,
++    DoubleOrdered = 0x34,
++    DoubleEqual = 0x2c,
++    DoubleNotEqual = 0x24,
++    DoubleGreaterThan = 0x1c,
++    DoubleGreaterThanOrEqual = 0x04,
++    DoubleLessThan = 0x0c,
++    DoubleLessThanOrEqual = 0x14,
++    DoubleUnordered = 0x3c,
++    DoubleEqualOrUnordered = DoubleEqual | DoubleConditionUnordered,
++    DoubleNotEqualOrUnordered = DoubleNotEqual | DoubleConditionUnordered,
++    DoubleGreaterThanOrUnordered = DoubleGreaterThan | DoubleConditionUnordered,
++    DoubleGreaterThanOrEqualOrUnordered =
++        DoubleGreaterThanOrEqual | DoubleConditionUnordered,
++    DoubleLessThanOrUnordered = DoubleLessThan | DoubleConditionUnordered,
++    DoubleLessThanOrEqualOrUnordered =
++        DoubleLessThanOrEqual | DoubleConditionUnordered,
++  };
++
++  enum JumpOrCall { BranchIsJump, BranchIsCall };
++
++  enum LinkBit {
++    DontLinkB = 0,
++    LinkB = 1,
++  };
++
++  enum LikelyBit {
++    NotLikelyB = 0,
++    LikelyB = 1,
++  };
++
++  enum BranchAddressType {
++    RelativeBranch = 0,
++    AbsoluteBranch = 2,
++  };
++
++  enum FloatFormat { SingleFloat, DoubleFloat };
++  enum FloatTestKind { TestForTrue, TestForFalse };
++
++  BufferOffset nextOffset() { return m_buffer.nextOffset(); }
++
++ protected:
++  Instruction* editSrc(BufferOffset bo) {
++    if (!bo.assigned()) {
++      // Under OOM, writeInst may return an unassigned BufferOffset.
++      // Return a dummy writable area so callers (WriteLoad64Instructions)
++      // can proceed harmlessly; the compilation will be discarded.
++      static uint32_t oomDummy_[8];
++      return (Instruction*)oomDummy_;
++    }
++    return m_buffer.getInst(bo);
++  }
++
++  struct RelativePatch {
++    BufferOffset offset;
++    void* target;
++    RelocationKind kind;
++
++    RelativePatch(BufferOffset offset, void* target, RelocationKind kind)
++        : offset(offset), target(target), kind(kind) {}
++  };
++
++  js::Vector<RelativePatch, 8, SystemAllocPolicy> jumps_;
++
++  CompactBufferWriter jumpRelocations_;
++  CompactBufferWriter dataRelocations_;
++
++  PPCBufferWithExecutableCopy m_buffer;
++
++#ifdef JS_JITSPEW
++  Sprinter* printer;
++#endif
++
++ public:
++  // Which absolute bit number does a CR + Condition pair refer to?
++  static uint8_t crBit(CRegisterID cr, Condition cond) {
++    return (cr << 2) + ((cond & 0xf0) >> 4);
++  }
++  static uint8_t crBit(CRegisterID cr, DoubleCondition cond) {
++    return (cr << 2) + ((cond & 0xf0) >> 4);
++  }
++
++  Assembler()
++      : m_buffer(/* poolMaxOffset */ 8192, /* nopFill */ 0),
++#ifdef JS_JITSPEW
++        printer(nullptr),
++#endif
++        isFinished(false),
++        scratch_register_list_((1 << Registers::r11) | (1 << Registers::r12)) {
++  }
++
++  void setUnlimitedBuffer() { m_buffer.setUnlimited(); }
++
++  // Constant pool callbacks required by AssemblerBufferWithConstantPools.
++  static void InsertIndexIntoTag(uint8_t* load, uint32_t index);
++  static bool PatchConstantPoolLoad(void* loadAddr, void* constPoolAddr);
++  static void WritePoolGuard(BufferOffset branch, Instruction* inst,
++                             BufferOffset dest);
++  static void WritePoolHeader(uint8_t* start, js::jit::Pool* p, bool isNatural);
++  static void PatchShortRangeBranchToVeneer(PPCBuffer*, unsigned rangeIdx,
++                                            BufferOffset deadline,
++                                            BufferOffset veneer);
++
++  static Condition InvertCondition(Condition cond);
++  static DoubleCondition InvertCondition(DoubleCondition cond);
++
++  void writeRelocation(BufferOffset src) {
++    jumpRelocations_.writeUnsigned(src.getOffset());
++  }
++
++  void writeDataRelocation(ImmGCPtr ptr) {
++    if (ptr.value) {
++      if (gc::IsInsideNursery(ptr.value)) {
++        embedsNurseryPointers_ = true;
++      }
++      dataRelocations_.writeUnsigned(nextOffset().getOffset());
++    }
++  }
++  void writeDataRelocation(BufferOffset bo, ImmGCPtr ptr) {
++    if (ptr.value) {
++      if (gc::IsInsideNursery(ptr.value)) {
++        embedsNurseryPointers_ = true;
++      }
++      dataRelocations_.writeUnsigned(bo.getOffset());
++    }
++  }
++
++  void assertNoGCThings() const {
++#ifdef DEBUG
++    MOZ_ASSERT(dataRelocations_.length() == 0);
++    for (auto& j : jumps_) {
++      MOZ_ASSERT(j.kind == RelocationKind::HARDCODED);
++    }
++#endif
++  }
++
++  bool oom() const;
++
++  void setPrinter(Sprinter* sp) {
++#ifdef JS_JITSPEW
++    printer = sp;
++#endif
++  }
++
++#ifdef JS_JITSPEW
++  inline void spew(const char* fmt, ...) MOZ_FORMAT_PRINTF(2, 3) {
++    if (MOZ_UNLIKELY(printer || JitSpewEnabled(JitSpew_Codegen))) {
++      va_list va;
++      va_start(va, fmt);
++      spewVA(fmt, va);
++      va_end(va);
++    }
++  }
++  MOZ_COLD void spewVA(const char* fmt, va_list va) MOZ_FORMAT_PRINTF(2, 0) {
++    char buf[200];
++    int i = VsprintfLiteral(buf, fmt, va);
++    if (i > -1) {
++      if (printer) {
++        printer->printf("%s\n", buf);
++      }
++      js::jit::JitSpew(js::jit::JitSpew_Codegen, "%s", buf);
++    }
++  }
++#else
++  MOZ_ALWAYS_INLINE void spew(const char* fmt, ...) MOZ_FORMAT_PRINTF(2, 3) {}
++#endif
++
++  Register getStackPointer() const { return StackPointer; }
++
++ protected:
++  bool isFinished;
++
++ public:
++  static uintptr_t GetPointer(uint8_t*);
++  void flush() {
++    MOZ_ASSERT(!isFinished);
++    m_buffer.flushPool();
++  }
++  // Inhibit pool flushes for the next maxInst instructions. Mirrors the
++  // ARM/ARM64 wrappers; lets shared code (e.g. WasmFrameIter epilogues
++  // that need static byte distances between currentOffset() captures)
++  // fence a small instruction window without reaching into m_buffer.
++  void enterNoPool(size_t maxInst) { m_buffer.enterNoPool(maxInst); }
++  void leaveNoPool() { m_buffer.leaveNoPool(); }
++  void finish();
++  bool appendRawCode(const uint8_t* code, size_t numBytes);
++  bool reserve(size_t size);
++  bool swapBuffer(wasm::Bytes& bytes);
++  void executableCopy(void* buffer);
++  void copyJumpRelocationTable(uint8_t* dest);
++  void copyDataRelocationTable(uint8_t* dest);
++
++  size_t size() const;
++  size_t jumpRelocationTableBytes() const;
++  size_t dataRelocationTableBytes() const;
++  size_t bytesNeeded() const;
++
++  BufferOffset writeInst(uint32_t x, uint32_t* dest = nullptr);
++  static void WriteInstStatic(uint32_t x, uint32_t* dest);
++
++ public:
++  BufferOffset haltingAlign(int alignment);
++  BufferOffset nopAlign(int alignment);
++  BufferOffset as_nop();
++
++  // --- Instruction emission (declarations only, implemented in later commits)
++
++  // Branch instructions.
++  uint16_t computeConditionCode(Condition op, CRegisterID cr = cr0);
++  uint16_t computeConditionCode(DoubleCondition cond, CRegisterID cr = cr0);
++  BufferOffset as_b(JOffImm26 off, BranchAddressType bat = RelativeBranch,
++                    LinkBit lb = DontLinkB);
++  BufferOffset as_b(int32_t off, BranchAddressType bat = RelativeBranch,
++                    LinkBit lb = DontLinkB);
++  BufferOffset as_blr(LinkBit lb = DontLinkB);
++  BufferOffset as_bctr(LinkBit lb = DontLinkB);
++  BufferOffset as_bc(BOffImm16 off, Condition cond, CRegisterID cr = cr0,
++                     LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
++  BufferOffset as_bc(int16_t off, Condition cond, CRegisterID cr = cr0,
++                     LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
++  BufferOffset as_bc(BOffImm16 off, DoubleCondition cond, CRegisterID cr = cr0,
++                     LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
++  BufferOffset as_bc(int16_t off, DoubleCondition cond, CRegisterID cr = cr0,
++                     LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
++  BufferOffset as_bcctr(Condition cond, CRegisterID cr = cr0,
++                        LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
++  BufferOffset as_bcctr(DoubleCondition cond, CRegisterID cr = cr0,
++                        LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
++  BufferOffset as_bc(int16_t off, uint16_t op, LikelyBit lkb = NotLikelyB,
++                     LinkBit lb = DontLinkB);
++  BufferOffset as_bcctr(uint16_t op, LikelyBit lkb = NotLikelyB,
++                        LinkBit lb = DontLinkB);
++
++  // SPR operations.
++  BufferOffset as_mtspr(SPRegisterID spr, Register ra);
++  BufferOffset as_mfspr(Register rd, SPRegisterID spr);
++
++  // CR operations.
++  BufferOffset as_crand(uint8_t t, uint8_t a, uint8_t b);
++  BufferOffset as_crandc(uint8_t t, uint8_t a, uint8_t b);
++  BufferOffset as_cror(uint8_t t, uint8_t a, uint8_t b);
++  BufferOffset as_crorc(uint8_t t, uint8_t a, uint8_t b);
++  BufferOffset as_crxor(uint8_t t, uint8_t a, uint8_t b);
++  BufferOffset as_mtcrf(uint32_t mask, Register rs);
++  BufferOffset as_mfocrf(Register rd, CRegisterID crfs);
++  BufferOffset as_mcrxrx(CRegisterID crt);
++
++  // Compare instructions.
++  BufferOffset as_cmpd(CRegisterID cr, Register ra, Register rb);
++  BufferOffset as_cmpdi(CRegisterID cr, Register ra, int16_t im);
++  BufferOffset as_cmpld(CRegisterID cr, Register ra, Register rb);
++  BufferOffset as_cmpldi(CRegisterID cr, Register ra, int16_t im);
++  BufferOffset as_cmpw(CRegisterID cr, Register ra, Register rb);
++  BufferOffset as_cmpwi(CRegisterID cr, Register ra, int16_t im);
++  BufferOffset as_cmplw(CRegisterID cr, Register ra, Register rb);
++  BufferOffset as_cmplwi(CRegisterID cr, Register ra, int16_t im);
++  BufferOffset as_cmpd(Register ra, Register rb);
++  BufferOffset as_cmpdi(Register ra, int16_t im);
++  BufferOffset as_cmpld(Register ra, Register rb);
++  BufferOffset as_cmpldi(Register ra, int16_t im);
++  BufferOffset as_cmpw(Register ra, Register rb);
++  BufferOffset as_cmpwi(Register ra, int16_t im);
++  BufferOffset as_cmplw(Register ra, Register rb);
++  BufferOffset as_cmplwi(Register ra, int16_t im);
++
++  // ALU (three-register).
++  BufferOffset as_add(Register rd, Register ra, Register rb);
++  BufferOffset as_addc(Register rd, Register ra, Register rb);
++  BufferOffset as_adde(Register rd, Register ra, Register rb);
++  BufferOffset as_subf(Register rd, Register ra, Register rb);
++  BufferOffset as_subfc(Register rd, Register ra, Register rb);
++  BufferOffset as_subfe(Register rd, Register ra, Register rb);
++  BufferOffset as_neg(Register rd, Register rs);
++
++  BufferOffset as_mulld(Register rd, Register ra, Register rb);
++  BufferOffset as_mulhd(Register rd, Register ra, Register rb);
++  BufferOffset as_mulhdu(Register rd, Register ra, Register rb);
++  BufferOffset as_mulldo(Register rd, Register ra, Register rb);
++  BufferOffset as_mullw(Register rd, Register ra, Register rb);
++  BufferOffset as_mulhwu(Register rd, Register ra, Register rb);
++
++  BufferOffset as_divd(Register rd, Register ra, Register rb);
++  BufferOffset as_divdu(Register rd, Register ra, Register rb);
++  BufferOffset as_divw(Register rd, Register ra, Register rb);
++  BufferOffset as_divwu(Register rd, Register ra, Register rb);
++  // POWER9 modulo.
++  BufferOffset as_modsd(Register rd, Register ra, Register rb);
++  BufferOffset as_modsw(Register rd, Register ra, Register rb);
++  BufferOffset as_modud(Register rd, Register ra, Register rb);
++  BufferOffset as_moduw(Register rd, Register ra, Register rb);
++
++  // ALU immediate.
++  BufferOffset as_addi(Register rd, Register ra, int16_t im,
++                       bool actually_li = false);
++  BufferOffset as_addis(Register rd, Register ra, int16_t im,
++                        bool actually_lis = false);
++  BufferOffset as_mulli(Register rd, Register ra, int16_t im);
++  BufferOffset as_subfic(Register rd, Register ra, int16_t im);
++
++  // ALU unary/extended.
++  BufferOffset as_cntlzw(Register rd, Register ra);
++  BufferOffset as_cntlzd(Register rd, Register ra);
++  BufferOffset as_cnttzd(Register rd, Register ra);
++  BufferOffset as_cnttzw(Register rd, Register ra);
++  BufferOffset as_popcntd(Register ra, Register rs);
++  BufferOffset as_popcntw(Register ra, Register rs);
++  // POWER10 byte-reverse doubleword: ra = bswap64(rs). 1 insn replacing the
++  // POWER9 mtvsrd / xxbrd / mfvsrd round-trip in byteSwap64.
++  BufferOffset as_brd(Register ra, Register rs);
++  // POWER10 byte-reverse each halfword (4 halfwords) / each word (2 words)
++  // in the 64-bit doubleword. The wasm/asm caller usually masks or
++  // sign-extends the low halfword/word afterwards.
++  BufferOffset as_brh(Register ra, Register rs);
++  BufferOffset as_brw(Register ra, Register rs);
++
++  // Bit operations (logical, three-register).
++  BufferOffset as_and_(Register rd, Register rs, Register rb);
++  BufferOffset as_and__rc(Register rd, Register rs, Register rb);
++  BufferOffset as_nor(Register rd, Register rs, Register rb);
++  BufferOffset as_or_(Register rd, Register rs, Register rb);
++  BufferOffset as_xor_(Register rd, Register rs, Register rb);
++  BufferOffset as_slw(Register rd, Register rs, Register rb);
++  BufferOffset as_srw(Register rd, Register rs, Register rb);
++  BufferOffset as_sraw(Register rd, Register rs, Register rb);
++  BufferOffset as_sld(Register rd, Register rs, Register rb);
++  BufferOffset as_srd(Register rd, Register rs, Register rb);
++  BufferOffset as_srad(Register rd, Register rs, Register rb);
++
++  // Bit operations (logical, immediate).
++  BufferOffset as_ori(Register rd, Register ra, uint16_t im);
++  BufferOffset as_oris(Register rd, Register ra, uint16_t im);
++  BufferOffset as_xori(Register rd, Register ra, uint16_t im);
++  BufferOffset as_xoris(Register rd, Register ra, uint16_t im);
++  BufferOffset as_andi_rc(Register rd, Register ra, uint16_t im);
++
++  // Sign extension.
++  BufferOffset as_extsb(Register rd, Register rs);
++  BufferOffset as_extsh(Register rd, Register rs);
++  BufferOffset as_extsw(Register rd, Register rs);
++  BufferOffset as_extsw_rc(Register rd, Register rs);
++
++  // Shift/rotate with immediates.
++  BufferOffset as_srawi(Register id, Register rs, uint8_t n);
++  BufferOffset as_sradi(Register rd, Register rs, int n);
++  BufferOffset as_rldcl(Register ra, Register rs, Register rb, uint8_t mb);
++  BufferOffset as_rldicl(Register ra, Register rs, uint8_t sh, uint8_t mb);
++  BufferOffset as_rldicl_rc(Register ra, Register rs, uint8_t sh, uint8_t mb);
++  BufferOffset as_rldicr(Register ra, Register rs, uint8_t sh, uint8_t mb);
++  BufferOffset as_rldicr_rc(Register ra, Register rs, uint8_t sh, uint8_t mb);
++  BufferOffset as_rlwinm(Register rd, Register rs, uint8_t sh, uint8_t mb,
++                         uint8_t me);
++  BufferOffset as_rlwinm_rc(Register rd, Register rs, uint8_t sh, uint8_t mb,
++                            uint8_t me);
++  BufferOffset as_rlwimi(Register rd, Register rs, uint8_t sh, uint8_t mb,
++                         uint8_t me);
++  BufferOffset as_rldimi(Register rd, Register rs, uint8_t sh, uint8_t mb);
++  BufferOffset as_rlwnm(Register rd, Register rs, Register rb, uint8_t mb,
++                        uint8_t me);
++
++  // Integer loads (D-form).
++  BufferOffset as_lbz(Register rd, Register rb, int16_t off);
++  BufferOffset as_lha(Register rd, Register rb, int16_t off);
++  BufferOffset as_lhz(Register rd, Register rb, int16_t off);
++  BufferOffset as_lwa(Register rd, Register rb, int16_t off);
++  BufferOffset as_lwz(Register rd, Register rb, int16_t off);
++  BufferOffset as_ld(Register rd, Register rb, int16_t off);
++
++  // Integer stores (D-form).
++  BufferOffset as_stb(Register rd, Register rb, int16_t off);
++  BufferOffset as_sth(Register rd, Register rb, int16_t off);
++  BufferOffset as_stw(Register rd, Register rb, int16_t off);
++  BufferOffset as_std(Register rd, Register rb, int16_t off);
++  BufferOffset as_stdu(Register rd, Register rb, int16_t off);
++
++  // Integer loads/stores (X-form, indexed).
++  BufferOffset as_lbzx(Register rd, Register ra, Register rb);
++  BufferOffset as_lhax(Register rd, Register ra, Register rb);
++  BufferOffset as_lhzx(Register rd, Register ra, Register rb);
++  BufferOffset as_lwzx(Register rd, Register ra, Register rb);
++  // X-form sign-extending word load. Single-insn equivalent of lwzx + extsw.
++  BufferOffset as_lwax(Register rd, Register ra, Register rb);
++  BufferOffset as_lwarx(Register rd, Register ra, Register rb);
++  BufferOffset as_lbarx(Register rd, Register ra, Register rb);
++  BufferOffset as_lharx(Register rd, Register ra, Register rb);
++  BufferOffset as_ldx(Register rd, Register ra, Register rb);
++  BufferOffset as_ldarx(Register rd, Register ra, Register rb);
++  BufferOffset as_stbx(Register rd, Register ra, Register rb);
++  BufferOffset as_stbcx(Register rd, Register ra, Register rb);
++  BufferOffset as_stwx(Register rd, Register ra, Register rb);
++  BufferOffset as_stwbrx(Register rd, Register ra, Register rb);
++  BufferOffset as_sthx(Register rd, Register ra, Register rb);
++  BufferOffset as_sthcx(Register rd, Register ra, Register rb);
++  BufferOffset as_stdx(Register rd, Register ra, Register rb);
++  BufferOffset as_stdcx(Register rd, Register ra, Register rb);
++  BufferOffset as_stwcx(Register rd, Register ra, Register rb);
++
++  // Integer select.
++  // POWER10 (ISA 3.1). Set RT = 1/0 based on a CR bit.
++  BufferOffset as_setbc(Register rt, uint16_t bc, CRegisterID cr);
++  BufferOffset as_setbcr(Register rt, uint16_t bc, CRegisterID cr);
++  BufferOffset as_isel(Register rt, Register ra, Register rb, uint16_t rc,
++                       CRegisterID cr = cr0);
++  BufferOffset as_isel0(Register rt, Register ra, Register rb, uint16_t rc,
++                        CRegisterID cr = cr0);
++
++  // FP compare.
++  BufferOffset as_fcmpu(CRegisterID cr, FloatRegister ra, FloatRegister rb);
++  BufferOffset as_fcmpu(FloatRegister ra, FloatRegister rb);
++
++  // FP arithmetic (two-source).
++  BufferOffset as_fadd(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++  BufferOffset as_fadds(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++  BufferOffset as_fsub(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++  BufferOffset as_fsubs(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++  BufferOffset as_fdiv(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++  BufferOffset as_fdivs(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++  BufferOffset as_fmul(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++  BufferOffset as_fmuls(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++  BufferOffset as_fcpsgn(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++  // FP unary.
++  BufferOffset as_fabs(FloatRegister rd, FloatRegister rs);
++  BufferOffset as_fneg(FloatRegister rd, FloatRegister rs);
++  BufferOffset as_fmr(FloatRegister rd, FloatRegister rs);
++  BufferOffset as_fsqrt(FloatRegister rd, FloatRegister rs);
++  BufferOffset as_fsqrts(FloatRegister rd, FloatRegister rs);
++  BufferOffset as_frsp(FloatRegister rd, FloatRegister rs);
++
++  // FP conversions.
++  BufferOffset as_fcfid(FloatRegister rd, FloatRegister rs);
++  BufferOffset as_fcfids(FloatRegister rd, FloatRegister rs);
++  BufferOffset as_fcfidu(FloatRegister rd, FloatRegister rs);
++  BufferOffset as_fcfidus(FloatRegister rd, FloatRegister rs);
++  BufferOffset as_fctid(FloatRegister rd, FloatRegister rs);
++  BufferOffset as_fctidz(FloatRegister rd, FloatRegister rs);
++  BufferOffset as_fctiduz(FloatRegister rd, FloatRegister rs);
++  BufferOffset as_fctiwz(FloatRegister rd, FloatRegister rs);
++
++  // FP rounding.
++  BufferOffset as_frim(FloatRegister rd, FloatRegister rs);
++  BufferOffset as_frip(FloatRegister rd, FloatRegister rs);
++  BufferOffset as_friz(FloatRegister rd, FloatRegister rs);
++
++  // FP loads (D-form).
++  BufferOffset as_lfd(FloatRegister rd, Register rb, int16_t off);
++  BufferOffset as_lfs(FloatRegister rd, Register rb, int16_t off);
++
++  // FP stores (D-form).
++  BufferOffset as_stfd(FloatRegister rd, Register rb, int16_t off);
++  BufferOffset as_stfs(FloatRegister rd, Register rb, int16_t off);
++  BufferOffset as_stfdu(FloatRegister rd, Register rb, int16_t off);
++  BufferOffset as_stfsu(FloatRegister rd, Register rb, int16_t off);
++
++  // FP loads/stores (X-form, indexed).
++  BufferOffset as_lfdx(FloatRegister rd, Register ra, Register rb);
++  BufferOffset as_lfsx(FloatRegister rd, Register ra, Register rb);
++  BufferOffset as_lfiwax(FloatRegister rd, Register ra, Register rb);
++  BufferOffset as_stfdx(FloatRegister rd, Register ra, Register rb);
++  BufferOffset as_stfsx(FloatRegister rd, Register ra, Register rb);
++
++  // FPSCR operations.
++  BufferOffset as_mtfsb0(uint8_t bt);
++  BufferOffset as_mcrfs(CRegisterID bf, uint8_t bfa);
++
++  // VSX (FPR-only subset).
++  BufferOffset as_mfvsrd(Register ra, FloatRegister xs);
++  BufferOffset as_mtvsrd(FloatRegister xs, Register ra);
++  // POWER8+ (ISA 2.07). Sign-extending move of RA's low 32 bits to FPR.
++  BufferOffset as_mtvsrwa(FloatRegister xs, Register ra);
++  BufferOffset as_mtvsrwz(FloatRegister xs, Register ra);
++  BufferOffset as_mtvsrws(FloatRegister xs, Register ra);
++  BufferOffset as_xxbrd(FloatRegister xt, FloatRegister xb);
++  // POWER9 scalar VSX max/min with Java/JavaScript semantics (matches
++  // ECMA-262 Math.max / Math.min). Operate on FPR-space (encoding 0..31).
++  BufferOffset as_xsmaxjdp(FloatRegister xt, FloatRegister xa,
++                           FloatRegister xb);
++  BufferOffset as_xsminjdp(FloatRegister xt, FloatRegister xa,
++                           FloatRegister xb);
++  BufferOffset as_xscvdpspn(FloatRegister xt, FloatRegister xb);
++  BufferOffset as_xscvspdpn(FloatRegister xt, FloatRegister xb);
++  // POWER9 (ISA 3.0) scalar FP16 conversions.
++  BufferOffset as_xscvdphp(FloatRegister xt, FloatRegister xb);
++  BufferOffset as_xscvhpdp(FloatRegister xt, FloatRegister xb);
++  // POWER9 (ISA 3.0) scalar extract biased exponent.
++  BufferOffset as_xsxexpdp(FloatRegister xt, FloatRegister xb);
++  // POWER9 (ISA 3.0) scalar FP16 load/store, X-form indexed.
++  BufferOffset as_lxsihzx(FloatRegister xt, Register ra, Register rb);
++  BufferOffset as_stxsihx(FloatRegister xs, Register ra, Register rb);
++
++  // VSX SIMD load/store (X-form, indexed).
++  BufferOffset as_lxvx(FloatRegister xt, Register ra, Register rb);
++  BufferOffset as_stxvx(FloatRegister xs, Register ra, Register rb);
++  BufferOffset as_lxvd2x(FloatRegister xt, Register ra, Register rb);
++  BufferOffset as_stxvd2x(FloatRegister xs, Register ra, Register rb);
++
++  // VMX SIMD load/store (X-form, indexed). Take a raw VR number (0-31)
++  // because VR20-VR31 are outside the FloatRegister encoding (which only
++  // covers VSR0-31 = f0-f31). Used by the JIT trampoline to save/restore
++  // the ELFv2 callee-saved VR20-VR31. EA is force-aligned to 16 bytes
++  // (low 4 bits of the address are ignored), so the slot's alignment
++  // matters for layout but not for trap avoidance.
++  BufferOffset as_lvx(uint8_t vrt, Register ra, Register rb);
++  BufferOffset as_stvx(uint8_t vrs, Register ra, Register rb);
++
++  // VSX SIMD register operations (XX3-form / XX1-form / XX2-form).
++  BufferOffset as_xxlor(FloatRegister xt, FloatRegister xa, FloatRegister xb);
++
++  // VSX bitwise operations (XX3-form).
++  BufferOffset as_xxland(FloatRegister xt, FloatRegister xa, FloatRegister xb);
++  BufferOffset as_xxlxor(FloatRegister xt, FloatRegister xa, FloatRegister xb);
++  BufferOffset as_xxlnor(FloatRegister xt, FloatRegister xa, FloatRegister xb);
++  BufferOffset as_xxlandc(FloatRegister xt, FloatRegister xa, FloatRegister xb);
++  BufferOffset as_xxsel(FloatRegister xt, FloatRegister xa, FloatRegister xb,
++                        FloatRegister xc);
++
++  // VMX integer arithmetic (VR0-31 = VSR32-63 only).
++  // Callers must ensure operands are in VR space.
++  BufferOffset as_vaddubm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vadduhm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vadduwm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vaddudm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsububm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsubuhm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsubuwm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsubudm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vaddsbs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vaddshs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vaddubs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vadduhs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsubsbs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsubshs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsububs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsubuhs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vminsb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vminsh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vminsw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmaxsb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmaxsh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmaxsw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmaxsd(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vminub(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vminuh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vminuw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmaxub(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmaxuh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmaxuw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  // POWER9 (ISA 3.0): per-lane integer negate.
++  BufferOffset as_vnegw(uint8_t vrt, uint8_t vrb);
++  BufferOffset as_vnegd(uint8_t vrt, uint8_t vrb);
++  // POWER9 (ISA 3.0): addpcis rT, D.  Computes rT = (CIA + 4) + (D << 16).
++  // D is a 16-bit signed immediate; DX-form splits D across three instruction
++  // fields (d0[16..25] ∥ d1[11..15] ∥ d2[31]).  No LR clobber, no RAS hazard.
++  BufferOffset as_addpcis(Register rt, int16_t d);
++  // POWER10 (ISA 3.1) prefixed instructions. Each emits 8 bytes (prefix +
++  // suffix) with a single nop inserted before iff the prefix would
++  // straddle a 64-byte block. Caller must guarantee HasPOWER10().
++  // imm34 is signed 34-bit; R=true selects PC-relative form (RA must be r0).
++  // Returns the offset of the prefix word.
++  BufferOffset as_paddi(Register rt, Register ra, int64_t imm34, bool R);
++  BufferOffset as_pld(Register rt, Register ra, int64_t imm34, bool R);
++  BufferOffset as_plxv(uint8_t xt, Register ra, int64_t imm34, bool R);
++  // FP-target prefixed loads: plfd/plfs are MLS (Type=2) with suffix
++  // opcodes 50 and 48. plfs widens single → double in the FPR
++  // (matches non-prefixed lfs semantics).
++  BufferOffset as_plfd(FloatRegister frt, Register ra, int64_t imm34,
++                       bool R);
++  BufferOffset as_plfs(FloatRegister frt, Register ra, int64_t imm34,
++                       bool R);
++  // Prefixed-store counterparts. Same prefix shape; suffix opcodes are
++  // the D-form variants of std/stxv/stfd/stfs (61, 27, 54, 52).
++  BufferOffset as_pstd(Register rs, Register ra, int64_t imm34, bool R);
++  BufferOffset as_pstxv(uint8_t xs, Register ra, int64_t imm34, bool R);
++  BufferOffset as_pstfd(FloatRegister frs, Register ra, int64_t imm34,
++                        bool R);
++  BufferOffset as_pstfs(FloatRegister frs, Register ra, int64_t imm34,
++                        bool R);
++
++ private:
++  // Emit a nop before a prefixed instruction iff the prefix would otherwise
++  // start at offset 60 (mod 64) and the suffix would land in the next block.
++  void ensurePrefixedAlignment();
++
++ public:
++  BufferOffset as_vavgub(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vavguh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmuluwm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmulld(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  // VMX shift (VR0-31 only).
++  BufferOffset as_vslb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vslh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vslw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsld(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsrb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsrh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsrw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsrd(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsrab(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsrah(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsraw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsrad(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vslo(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vsro(uint8_t vrt, uint8_t vra, uint8_t vrb);
++
++  // VMX integer compare (VR0-31 only).
++  BufferOffset as_vcmpequb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpequh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpequw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpequd(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  // Record forms set CR6: LT = all-true, EQ = none-true.
++  BufferOffset as_vcmpequb_rc(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpequh_rc(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpequw_rc(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpequd_rc(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpgtsb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpgtsh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpgtsw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpgtsd(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpgtub(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpgtuh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpgtuw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpgtud(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  // POWER9 (ISA 3.0). NotEqual compare; no doubleword variant.
++  BufferOffset as_vcmpneb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpneh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vcmpnew(uint8_t vrt, uint8_t vra, uint8_t vrb);
++
++  // VSX float compare (XX3-form, VSR0-63).
++  BufferOffset as_xvcmpeqsp(FloatRegister xt, FloatRegister xa,
++                            FloatRegister xb);
++  BufferOffset as_xvcmpgtsp(FloatRegister xt, FloatRegister xa,
++                            FloatRegister xb);
++  BufferOffset as_xvcmpgesp(FloatRegister xt, FloatRegister xa,
++                            FloatRegister xb);
++  BufferOffset as_xvcmpeqdp(FloatRegister xt, FloatRegister xa,
++                            FloatRegister xb);
++  BufferOffset as_xvcmpgtdp(FloatRegister xt, FloatRegister xa,
++                            FloatRegister xb);
++  BufferOffset as_xvcmpgedp(FloatRegister xt, FloatRegister xa,
++                            FloatRegister xb);
++
++  // VSX float arithmetic (XX3-form binary, XX2-form unary).
++#define DECL_VSX_BIN(op) \
++  BufferOffset as_##op(FloatRegister xt, FloatRegister xa, FloatRegister xb);
++  DECL_VSX_BIN(xvaddsp)
++  DECL_VSX_BIN(xvadddp) DECL_VSX_BIN(xvsubsp) DECL_VSX_BIN(
++      xvsubdp) DECL_VSX_BIN(xvmulsp) DECL_VSX_BIN(xvmuldp) DECL_VSX_BIN(xvdivsp)
++      DECL_VSX_BIN(xvdivdp) DECL_VSX_BIN(xvminsp) DECL_VSX_BIN(
++          xvmindp) DECL_VSX_BIN(xvmaxsp) DECL_VSX_BIN(xvmaxdp)
++          DECL_VSX_BIN(xvmaddasp) DECL_VSX_BIN(xvmaddadp) DECL_VSX_BIN(
++              xvnmsubasp) DECL_VSX_BIN(xvnmsubadp)
++#undef DECL_VSX_BIN
++#define DECL_VSX_UN(op) \
++  BufferOffset as_##op(FloatRegister xt, FloatRegister xb);
++              DECL_VSX_UN(xvabssp) DECL_VSX_UN(xvabsdp) DECL_VSX_UN(xvnegsp)
++                  DECL_VSX_UN(xvnegdp) DECL_VSX_UN(xvsqrtsp) DECL_VSX_UN(
++                      xvsqrtdp) DECL_VSX_UN(xvrspip) DECL_VSX_UN(xvrdpip)
++                      DECL_VSX_UN(xvrspim) DECL_VSX_UN(xvrdpim) DECL_VSX_UN(
++                          xvrspiz) DECL_VSX_UN(xvrdpiz) DECL_VSX_UN(xvrspic)
++                          DECL_VSX_UN(xvrdpic) DECL_VSX_UN(xvcvsxwsp)
++                              DECL_VSX_UN(xvcvuxwsp) DECL_VSX_UN(xvcvsxwdp)
++                                  DECL_VSX_UN(xvcvuxwdp) DECL_VSX_UN(xvcvspsxws)
++                                      DECL_VSX_UN(xvcvspuxws)
++                                          DECL_VSX_UN(xvcvdpsxws)
++                                              DECL_VSX_UN(xvcvdpuxws)
++                                                  DECL_VSX_UN(xvcvdpsp)
++                                                      DECL_VSX_UN(xvcvspdp)
++#undef DECL_VSX_UN
++
++      // VMX widen/narrow/merge/pack (VR0-31 only).
++      BufferOffset as_vupkhsb(uint8_t vrt, uint8_t vrb);
++  BufferOffset as_vupklsb(uint8_t vrt, uint8_t vrb);
++  BufferOffset as_vupkhsh(uint8_t vrt, uint8_t vrb);
++  BufferOffset as_vupklsh(uint8_t vrt, uint8_t vrb);
++  BufferOffset as_vupkhsw(uint8_t vrt, uint8_t vrb);
++  BufferOffset as_vupklsw(uint8_t vrt, uint8_t vrb);
++  BufferOffset as_vpkshss(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vpkswss(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vpkshus(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vpkswus(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmrghb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmrghh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmrghw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmrglb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmrglh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmrglw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++
++  // VMX extended multiply (VR0-31 only).
++  BufferOffset as_vmulesb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmulosb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmuleub(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmuloub(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmulesh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmulosh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmuleuh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmulouh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmulesw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmulosw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmuleuw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vmulouw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  BufferOffset as_vpopcntb(uint8_t vrt, uint8_t vrb);
++  BufferOffset as_vperm(uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc);
++  // POWER8+ (ISA 2.07). VX-form bit-permute. See PPC_vbpermq comment.
++  BufferOffset as_vbpermq(uint8_t vrt, uint8_t vra, uint8_t vrb);
++  // POWER10 (ISA 3.1) Vector Extract Mask. RT is a GPR.
++  BufferOffset as_vextractbm(Register rt, FloatRegister vrb);
++  BufferOffset as_vextracthm(Register rt, FloatRegister vrb);
++  BufferOffset as_vextractwm(Register rt, FloatRegister vrb);
++  BufferOffset as_vextractdm(Register rt, FloatRegister vrb);
++  // POWER10 (ISA 3.1) Vector Insert from GPR at immediate byte offset.
++  // UIM range: vinsw 0..12, vinsd 0..8 (caller must enforce).
++  BufferOffset as_vinsw(FloatRegister vrt, Register rb, uint8_t uim);
++  BufferOffset as_vinsd(FloatRegister vrt, Register rb, uint8_t uim);
++  // POWER10 (ISA 3.1) Vector Insert byte / halfword from GPR with the
++  // byte position supplied by another GPR (RA & 0xF for vinsbrx,
++  // RA & 0xE for vinshrx). "rx" = right-indexed = LE-natural.
++  BufferOffset as_vinsbrx(FloatRegister vrt, Register ra, Register rb);
++  BufferOffset as_vinshrx(FloatRegister vrt, Register ra, Register rb);
++  // POWER9 (ISA 3.0) Vector Insert byte / halfword from VR at immediate
++  // byte position. UIM range: vinsertb 0..15, vinserth 0..14
++  // (caller must enforce; vinserth UIM is in bytes, even-aligned).
++  BufferOffset as_vinsertb(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
++  BufferOffset as_vinserth(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
++  // POWER9 (ISA 3.0) Vector Extract byte / halfword from VR at immediate
++  // BE byte position. UIM range: vextractub 0..15, vextractuh 0..14
++  // (caller must enforce; vextractuh UIM is in bytes, even-aligned). The
++  // extracted byte/halfword lands at BE byte 7 of VRT, with the rest
++  // zeroed — so a subsequent mfvsrd reads it as the low byte/halfword
++  // of the GPR with implicit zero-extension.
++  BufferOffset as_vextractub(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
++  BufferOffset as_vextractuh(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
++  // VX-form with 5-bit signed immediate splat: each lane of VRT is
++  // set to sign_extend(SIMM5) (range [-16, 15]) at byte/halfword/word granularity.
++  BufferOffset as_vspltisb(uint8_t vrt, int8_t simm5);
++  BufferOffset as_vspltish(uint8_t vrt, int8_t simm5);
++  BufferOffset as_vspltisw(uint8_t vrt, int8_t simm5);
++
++  // VA-form ternary VMX instructions.
++  BufferOffset as_vmladduhm(uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc);
++  BufferOffset as_vmhraddshs(uint8_t vrt, uint8_t vra, uint8_t vrb,
++                             uint8_t vrc);
++  BufferOffset as_vmsumshm(uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc);
++  BufferOffset as_vmsumuhm(uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc);
++  BufferOffset as_xxpermdi(FloatRegister xt, FloatRegister xa, FloatRegister xb,
++                           uint8_t dm);
++  BufferOffset as_xxspltw(FloatRegister xt, FloatRegister xb, uint8_t uim);
++  // POWER9 (ISA 3.0). Splat 8-bit immediate to all 16 bytes of an FPR-encoded
++  // VSR (TX bit forced 0). XX1-form, no Rc.
++  BufferOffset as_xxspltib(FloatRegister xt, uint8_t imm8);
++  BufferOffset as_xxinsertw(FloatRegister xt, FloatRegister xb, uint8_t uim);
++  BufferOffset as_xxextractuw(FloatRegister xt, FloatRegister xb, uint8_t uim);
++  BufferOffset as_mtvsrdd(FloatRegister xt, Register ra, Register rb);
++  BufferOffset as_mfvsrld(Register rt, FloatRegister xs);
++
++  // VMX vector operations.
++  BufferOffset as_vspltb(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
++  BufferOffset as_vsplth(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
++  BufferOffset as_vsldoi(FloatRegister vrt, FloatRegister vra,
++                         FloatRegister vrb, uint8_t shb);
++
++  // Barrier and sync instructions.
++  BufferOffset as_lwsync();
++  BufferOffset as_sync();
++  BufferOffset as_isync();
++
++  // Convenience pseudo-instructions.
++  BufferOffset xs_trap();
++  BufferOffset xs_trap_tagged(TrapTag tag);
++  BufferOffset xs_mr(Register rd, Register ra);
++  BufferOffset xs_mtctr(Register ra);
++  BufferOffset xs_mtlr(Register ra);
++  BufferOffset xs_mflr(Register rd);
++  BufferOffset xs_mtcr(Register rs);
++  BufferOffset xs_mfxer(Register ra);
++  BufferOffset xs_mtxer(Register ra);
++  BufferOffset xs_li(Register rd, int16_t im);
++  BufferOffset xs_lis(Register rd, int16_t im);
++  BufferOffset x_subi(Register rd, Register ra, int16_t im);
++  BufferOffset x_not(Register rd, Register ra);
++  BufferOffset x_slwi(Register rd, Register rs, int n);
++  BufferOffset x_sldi(Register rd, Register rs, int n);
++  BufferOffset x_srwi(Register rd, Register rs, int n);
++  BufferOffset x_srdi(Register rd, Register rs, int n);
++  BufferOffset x_insertbits0_15(Register rd, Register rs);
++  BufferOffset x_bit_value(Register rd, Register rs, unsigned bit);
++  BufferOffset x_sr_mulli(Register rd, Register ra, int16_t im);
++
++  // --- Label operations.
++  void bind(Label* label) { bind(label, nextOffset()); }
++  void bind(Label* label, BufferOffset boff);
++  void bind(InstImm* inst, uintptr_t branch, uintptr_t target);
++  void bind(CodeLabel* label) { label->target()->bind(currentOffset()); }
++  uint32_t currentOffset() { return nextOffset().getOffset(); }
++  void retarget(Label* label, Label* target);
++  void call(Label* label);
++  void call(void* target);
++
++  void as_break(uint32_t code);
++
++  // --- Static capability queries.
++  static bool SupportsFloatingPoint() { return true; }
++  static bool SupportsWasmSimd() { return true; }
++  static bool SupportsUnalignedAccesses() { return true; }
++  static bool SupportsFastUnalignedFPAccesses() { return true; }
++  // POWER9 has scalar FP16 hardware (xscvdphp/xscvhpdp); POWER8 doesn't.
++  // Runtime-gate like x86's SupportsFloat32To16 (which keys off F16C).
++  static bool SupportsFloat64To16() { return HasPOWER9(); }
++  static bool SupportsFloat32To16() { return HasPOWER9(); }
++  static bool HasRoundInstruction(RoundingMode mode) {
++    // PPC64 has friz (trunc), frip (ceil), frim (floor), which are all correct.
++    // frin (round-to-nearest) does NOT implement proper IEEE banker's rounding
++    // (ties to even), so NearestTiesToEven is not supported.
++    return mode == RoundingMode::TowardsZero || mode == RoundingMode::Up ||
++           mode == RoundingMode::Down;
++  }
++
++ protected:
++  InstImm invertBranch(InstImm branch, BOffImm16 skipOffset);
++  void addPendingJump(BufferOffset src, ImmPtr target, RelocationKind kind) {
++    enoughMemory_ &= jumps_.append(RelativePatch(src, target.value, kind));
++    if (kind == RelocationKind::JITCODE) {
++      writeRelocation(src);
++    }
++  }
++  void addLongJump(BufferOffset src, BufferOffset dst) {
++    CodeLabel cl;
++    cl.patchAt()->bind(src.getOffset());
++    cl.target()->bind(dst.getOffset());
++    cl.setLinkMode(CodeLabel::JumpImmediate);
++    addCodeLabel(std::move(cl));
++  }
++
++ public:
++  void flushBuffer() { m_buffer.flushPool(); }
++  void comment(const char* msg) { spew("; %s", msg); }
++  static uint32_t NopSize() { return 4; }
++
++  // --- Static patching API.
++  static uint64_t ExtractLoad64Value(Instruction* inst0);
++  static void UpdateLoad64Value(Instruction* inst0, uint64_t value);
++  static void WriteLoad64Instructions(Instruction* inst0, Register reg,
++                                      uint64_t value);
++
++  static void PatchWrite_Imm32(CodeLocationLabel label, Imm32 imm);
++  static uint8_t* NextInstruction(uint8_t* instruction,
++                                  uint32_t* count = nullptr);
++  static void ToggleToJmp(CodeLocationLabel inst_);
++  static void ToggleToCmp(CodeLocationLabel inst_);
++
++  void verifyHeapAccessDisassembly(uint32_t begin, uint32_t end,
++                                   const Disassembler::HeapAccess& ha) {}
++
++  // --- Public patching API (required by shared code).
++  static void Bind(uint8_t* rawCode, const CodeLabel& label);
++  void processCodeLabels(uint8_t* rawCode);
++
++  static void TraceJumpRelocations(JSTracer* trc, JitCode* code,
++                                   CompactBufferReader& reader);
++  static void TraceDataRelocations(JSTracer* trc, JitCode* code,
++                                   CompactBufferReader& reader);
++
++  void executableCopy(uint8_t* buffer);
++
++  static uint32_t PatchWrite_NearCallSize();
++  static void PatchWrite_NearCall(CodeLocationLabel start,
++                                  CodeLocationLabel toCall);
++  static void PatchDataWithValueCheck(CodeLocationLabel label, ImmPtr newValue,
++                                      ImmPtr expectedValue);
++  static void PatchDataWithValueCheck(CodeLocationLabel label,
++                                      PatchedImmPtr newValue,
++                                      PatchedImmPtr expectedValue);
++  static void ToggleCall(CodeLocationLabel inst_, bool enabled);
++
++ private:
++  GeneralRegisterSet scratch_register_list_;
++
++ public:
++  GeneralRegisterSet* GetScratchRegisterList() {
++    return &scratch_register_list_;
++  }
++};  // Assembler
++
++inline bool IsUnaligned(const wasm::MemoryAccessDesc& access) {
++  if (!access.align()) {
++    return false;
++  }
++  return access.align() < access.byteSize();
++}
++
++}  // namespace jit
++}  // namespace js
++
++// Whether an Imm32 fits in an unsigned 16-bit immediate.
++#define PPC_IMM_OK_U(x) (MOZ_LIKELY(((x).value & 0xffff0000) == 0))
++
++// Whether an Imm32 fits in a signed 16-bit immediate.
++#define PPC_IMM_OK_S(x)                        \
++  (MOZ_LIKELY(((x).value & 0xffff8000) == 0 || \
++              ((x).value & 0xffff8000) == 0xffff8000))
++
++// Whether the offset part of an Address fits in a signed 16-bit immediate.
++#define PPC_OFFS_OK(x)                          \
++  (MOZ_LIKELY(((x).offset & 0xffff8000) == 0 || \
++              ((x).offset & 0xffff8000) == 0xffff8000))
++
++// Same test but checking a bit ahead (for paired loads).
++#define PPC_OFFS_INCR_OK(x, incr)                          \
++  (MOZ_LIKELY((((x).offset + (incr)) & 0xffff8000) == 0 || \
++              (((x).offset + (incr)) & 0xffff8000) == 0xffff8000))
++
++#endif /* jit_ppc64_Assembler_ppc64_h */
+diff --git a/js/src/jit/ppc64/CodeGenerator-ppc64.cpp b/js/src/jit/ppc64/CodeGenerator-ppc64.cpp
+new file mode 100644
+index 000000000000..0a436fb1201a
+--- /dev/null
++++ b/js/src/jit/ppc64/CodeGenerator-ppc64.cpp
+@@ -0,0 +1,3647 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/ppc64/CodeGenerator-ppc64.h"
++
++#include "mozilla/MathAlgorithms.h"
++
++#include <bit>
++
++#include "builtin/Number.h"
++#include "jit/CodeGenerator.h"
++#include "jit/InlineScriptTree.h"
++#include "jit/JitRuntime.h"
++#include "jit/MIR-wasm.h"
++#include "jit/MIR.h"
++#include "jit/MIRGraph.h"
++#include "vm/JSContext.h"
++#include "vm/Realm.h"
++#include "vm/Shape.h"
++
++#include "jit/shared/CodeGenerator-shared-inl.h"
++#include "vm/JSScript-inl.h"
++
++using namespace js;
++using namespace js::jit;
++
++using JS::GenericNaN;
++using mozilla::NegativeInfinity;
++
++namespace js {
++namespace jit {
++
++CodeGeneratorPPC64::CodeGeneratorPPC64(MIRGenerator* gen, LIRGraph* graph,
++                                       MacroAssembler* masm,
++                                       const wasm::CodeMetadata* codeMeta)
++    : CodeGeneratorShared(gen, graph, masm, codeMeta) {}
++
++Operand CodeGeneratorPPC64::ToOperand(const LAllocation& a) {
++  if (a.isGeneralReg()) {
++    return Operand(a.toGeneralReg()->reg());
++  }
++  if (a.isFloatReg()) {
++    return Operand(a.toFloatReg()->reg());
++  }
++  return Operand(ToAddress(a));
++}
++
++Operand CodeGeneratorPPC64::ToOperand(const LAllocation* a) {
++  return ToOperand(*a);
++}
++
++MoveOperand CodeGeneratorPPC64::toMoveOperand(LAllocation a) const {
++  if (a.isGeneralReg()) {
++    return MoveOperand(ToRegister(a));
++  }
++  if (a.isFloatReg()) {
++    return MoveOperand(ToFloatRegister(a));
++  }
++  MoveOperand::Kind kind = a.isStackArea() ? MoveOperand::Kind::EffectiveAddress
++                                           : MoveOperand::Kind::Memory;
++  Address address = ToAddress(a);
++  MOZ_ASSERT((address.offset & 3) == 0);
++  return MoveOperand(address, kind);
++}
++
++void CodeGeneratorPPC64::bailoutFrom(Label* label, LSnapshot* snapshot) {
++  MOZ_ASSERT_IF(!masm.oom(), label->used());
++  MOZ_ASSERT_IF(!masm.oom(), !label->bound());
++
++  encode(snapshot);
++
++  InlineScriptTree* tree = snapshot->mir()->block()->trackedTree();
++  auto* ool = new (alloc()) LambdaOutOfLineCode([=, this](OutOfLineCode& ool) {
++    // Push snapshotOffset and make sure stack is aligned.
++    masm.subPtr(Imm32(sizeof(Value)), StackPointer);
++    masm.storePtr(ImmWord(snapshot->snapshotOffset()),
++                  Address(StackPointer, 0));
++    masm.jump(&deoptLabel_);
++  });
++  addOutOfLineCode(ool,
++                   new (alloc()) BytecodeSite(tree, tree->script()->code()));
++
++  masm.retarget(label, ool->entry());
++}
++
++void CodeGeneratorPPC64::bailout(LSnapshot* snapshot) {
++  Label label;
++  masm.jump(&label);
++  bailoutFrom(&label, snapshot);
++}
++
++void CodeGeneratorPPC64::bailoutIfFalseBool(Register lhs, LSnapshot* snapshot) {
++  Label bail;
++  masm.branchTest32(Assembler::Zero, lhs, Imm32(0xFF), &bail);
++  bailoutFrom(&bail, snapshot);
++}
++
++bool CodeGeneratorPPC64::generateOutOfLineCode() {
++  if (!CodeGeneratorShared::generateOutOfLineCode()) {
++    return false;
++  }
++
++  if (deoptLabel_.used()) {
++    masm.bind(&deoptLabel_);
++
++    // Frame size is stored in LR and pushed by GenerateBailoutThunk
++    // (via PushBailoutFrame -> pushReturnAddress -> mflr).
++    {
++      UseScratchRegisterScope temps(masm);
++      Register scratch = temps.Acquire();
++      masm.movePtr(ImmWord(frameSize()), scratch);
++      masm.xs_mtlr(scratch);
++    }
++
++    TrampolinePtr handler = gen->jitRuntime()->getGenericBailoutHandler();
++    masm.jump(handler);
++  }
++
++  return !masm.oom();
++}
++
++void CodeGeneratorPPC64::branchToBlock(MBasicBlock* block) {
++  Label* label = skipTrivialBlocks(block)->lir()->label();
++  masm.jump(label);
++}
++
++void CodeGeneratorPPC64::branchToBlock(Assembler::DoubleCondition cond,
++                                       FloatRegister lhs, FloatRegister rhs,
++                                       MBasicBlock* mir) {
++  Label* label = skipTrivialBlocks(mir)->lir()->label();
++  masm.branchDouble(cond, lhs, rhs, label);
++}
++
++void CodeGeneratorPPC64::branchToBlock(Assembler::FloatFormat fmt,
++                                       Assembler::DoubleCondition cond,
++                                       FloatRegister lhs, FloatRegister rhs,
++                                       MBasicBlock* mir) {
++  Label* label = skipTrivialBlocks(mir)->lir()->label();
++  if (fmt == Assembler::DoubleFloat) {
++    masm.branchDouble(cond, lhs, rhs, label);
++  } else {
++    masm.branchFloat(cond, lhs, rhs, label);
++  }
++}
++
++class OutOfLineTableSwitch : public OutOfLineCodeBase<CodeGeneratorPPC64> {
++  MTableSwitch* mir_;
++  CodeLabel jumpLabel_;
++
++  void accept(CodeGeneratorPPC64* codegen) {
++    codegen->visitOutOfLineTableSwitch(this);
++  }
++
++ public:
++  explicit OutOfLineTableSwitch(MTableSwitch* mir) : mir_(mir) {}
++
++  MTableSwitch* mir() const { return mir_; }
++  CodeLabel* jumpLabel() { return &jumpLabel_; }
++};
++
++void CodeGeneratorPPC64::emitTableSwitchDispatch(MTableSwitch* mir,
++                                                 Register index,
++                                                 Register base) {
++  Label* defaultcase = skipTrivialBlocks(mir->getDefault())->lir()->label();
++
++  if (mir->low() != 0) {
++    masm.subPtr(Imm32(mir->low()), index);
++  }
++
++  int32_t cases = mir->numCases();
++  masm.branchPtr(Assembler::AboveOrEqual, index, ImmWord(cases), defaultcase);
++
++  OutOfLineTableSwitch* ool = new (alloc()) OutOfLineTableSwitch(mir);
++  addOutOfLineCode(ool, mir);
++
++  masm.mov(ool->jumpLabel(), base);
++
++  BaseIndex pointer(base, index, ScalePointer);
++  masm.branchToComputedAddress(pointer);
++}
++
++void CodeGeneratorPPC64::generateInvalidateEpilogue() {
++  // Pad with enough nops so that PatchWrite_NearCall on the last OSI point
++  // cannot overlap the invalidation epilogue. The patch area is
++  // PatchWrite_NearCallSize (40) bytes; the last OSI point could be right
++  // before this epilogue.
++  for (size_t i = 0; i < Assembler::PatchWrite_NearCallSize();
++       i += Assembler::NopSize()) {
++    masm.nop();
++  }
++
++  masm.bind(&invalidate_);
++
++  // Push the return address (LR) onto the stack.
++  masm.pushReturnAddress();
++
++  invalidateEpilogueData_ = masm.pushWithPatch(ImmWord(uintptr_t(-1)));
++
++  TrampolinePtr thunk = gen->jitRuntime()->getInvalidationThunk();
++  masm.jump(thunk);
++}
++
++void CodeGeneratorPPC64::visitOutOfLineTableSwitch(OutOfLineTableSwitch* ool) {
++  MTableSwitch* mir = ool->mir();
++
++  masm.haltingAlign(sizeof(void*));
++  masm.bind(ool->jumpLabel());
++  masm.addCodeLabel(*ool->jumpLabel());
++
++  for (size_t i = 0; i < mir->numCases(); i++) {
++    LBlock* caseblock = skipTrivialBlocks(mir->getCase(i))->lir();
++    Label* caseheader = caseblock->label();
++    uint32_t caseoffset = caseheader->offset();
++
++    CodeLabel cl;
++    masm.writeCodePointer(&cl);
++    cl.target()->bind(caseoffset);
++    masm.addCodeLabel(cl);
++  }
++}
++
++void CodeGeneratorPPC64::visitOutOfLineWasmTruncateCheck(
++    OutOfLineWasmTruncateCheck* ool) {
++  if (ool->toType() == MIRType::Int32) {
++    masm.outOfLineWasmTruncateToInt32Check(ool->input(), ool->output(),
++                                           ool->fromType(), ool->flags(),
++                                           ool->rejoin(), ool->trapSiteDesc());
++  } else {
++    MOZ_ASSERT(ool->toType() == MIRType::Int64);
++    masm.outOfLineWasmTruncateToInt64Check(ool->input(), ool->output64(),
++                                           ool->fromType(), ool->flags(),
++                                           ool->rejoin(), ool->trapSiteDesc());
++  }
++}
++
++void CodeGeneratorPPC64::emitBigIntPtrDiv(LBigIntPtrDiv* ins, Register dividend,
++                                          Register divisor, Register output) {
++  masm.as_divd(output, dividend, divisor);
++}
++
++void CodeGeneratorPPC64::emitBigIntPtrMod(LBigIntPtrMod* ins, Register dividend,
++                                          Register divisor, Register output) {
++  if (HasPOWER9()) {
++    masm.as_modsd(output, dividend, divisor);
++  } else {
++    masm.as_divd(output, dividend, divisor);
++    masm.as_mulld(output, output, divisor);
++    masm.as_subf(output, output, dividend);
++  }
++}
++
++// ===============================================================
++// Visitors: Box/Unbox
++
++void CodeGenerator::visitBox(LBox* box) {
++  const LAllocation* in = box->getOperand(0);
++  ValueOperand result = ToOutValue(box);
++
++  masm.moveValue(TypedOrValueRegister(box->type(), ToAnyRegister(in)), result);
++}
++
++void CodeGenerator::visitUnbox(LUnbox* unbox) {
++  MUnbox* mir = unbox->mir();
++
++  Register result = ToRegister(unbox->output());
++
++  if (mir->fallible()) {
++    ValueOperand value = ToValue(unbox->input());
++    Label bail;
++    switch (mir->type()) {
++      case MIRType::Int32:
++        masm.fallibleUnboxInt32(value, result, &bail);
++        break;
++      case MIRType::Boolean:
++        masm.fallibleUnboxBoolean(value, result, &bail);
++        break;
++      case MIRType::Object:
++        masm.fallibleUnboxObject(value, result, &bail);
++        break;
++      case MIRType::String:
++        masm.fallibleUnboxString(value, result, &bail);
++        break;
++      case MIRType::Symbol:
++        masm.fallibleUnboxSymbol(value, result, &bail);
++        break;
++      case MIRType::BigInt:
++        masm.fallibleUnboxBigInt(value, result, &bail);
++        break;
++      default:
++        MOZ_CRASH("Given MIRType cannot be unboxed.");
++    }
++    bailoutFrom(&bail, unbox->snapshot());
++    return;
++  }
++
++  LAllocation* input = unbox->getOperand(LUnbox::Input);
++  if (input->isGeneralReg()) {
++    Register inputReg = ToRegister(input);
++    switch (mir->type()) {
++      case MIRType::Int32:
++        masm.unboxInt32(ValueOperand(inputReg), result);
++        break;
++      case MIRType::Boolean:
++        masm.unboxBoolean(ValueOperand(inputReg), result);
++        break;
++      case MIRType::Object:
++        masm.unboxObject(ValueOperand(inputReg), result);
++        break;
++      case MIRType::String:
++        masm.unboxString(ValueOperand(inputReg), result);
++        break;
++      case MIRType::Symbol:
++        masm.unboxSymbol(ValueOperand(inputReg), result);
++        break;
++      case MIRType::BigInt:
++        masm.unboxBigInt(ValueOperand(inputReg), result);
++        break;
++      default:
++        MOZ_CRASH("Given MIRType cannot be unboxed.");
++    }
++    return;
++  }
++
++  Address inputAddr = ToAddress(input);
++  switch (mir->type()) {
++    case MIRType::Int32:
++      masm.unboxInt32(inputAddr, result);
++      break;
++    case MIRType::Boolean:
++      masm.unboxBoolean(inputAddr, result);
++      break;
++    case MIRType::Object:
++      masm.unboxObject(inputAddr, result);
++      break;
++    case MIRType::String:
++      masm.unboxString(inputAddr, result);
++      break;
++    case MIRType::Symbol:
++      masm.unboxSymbol(inputAddr, result);
++      break;
++    case MIRType::BigInt:
++      masm.unboxBigInt(inputAddr, result);
++      break;
++    default:
++      MOZ_CRASH("Given MIRType cannot be unboxed.");
++  }
++}
++
++// ===============================================================
++// Visitors: Integer Arithmetic
++
++void CodeGenerator::visitAddI(LAddI* ins) {
++  LAllocation* lhs = ins->getOperand(0);
++  LAllocation* rhs = ins->getOperand(1);
++  Register dest = ToRegister(ins->getDef(0));
++
++  if (rhs->isConstant()) {
++    Imm32 imm(ToInt32(rhs));
++    if (ins->snapshot()) {
++      masm.move32(ToRegister(lhs), dest);
++      Label overflow;
++      masm.branchAdd32(Assembler::Overflow, imm, dest, &overflow);
++      bailoutFrom(&overflow, ins->snapshot());
++    } else {
++      masm.add32(imm, ToRegister(lhs), dest);
++    }
++  } else {
++    Register rhsReg = ToRegister(rhs);
++    if (ins->snapshot()) {
++      // Use 3-operand add to avoid clobbering rhs when rhs == dest.
++      masm.as_add(dest, ToRegister(lhs), rhsReg);
++      // Check 32-bit overflow: sign-extend lower 32 and compare.
++      masm.as_extsw(SecondScratchReg, dest);
++      Label overflow;
++      masm.as_cmpd(dest, SecondScratchReg);
++      masm.ma_b(Assembler::NotEqual, &overflow);
++      masm.as_extsw(dest, dest);
++      bailoutFrom(&overflow, ins->snapshot());
++    } else {
++      masm.as_add(dest, ToRegister(lhs), rhsReg);
++      masm.as_extsw(dest, dest);
++    }
++  }
++}
++
++void CodeGenerator::visitAddIntPtr(LAddIntPtr* ins) {
++  Register dest = ToRegister(ins->getDef(0));
++  Register lhs = ToRegister(ins->getOperand(0));
++  const LAllocation* rhs = ins->getOperand(1);
++
++  if (rhs->isConstant()) {
++    if (lhs != dest) {
++      masm.movePtr(lhs, dest);
++    }
++    masm.addPtr(ImmWord(ToIntPtr(rhs)), dest);
++  } else {
++    masm.as_add(dest, lhs, ToRegister(rhs));
++  }
++}
++
++void CodeGenerator::visitAddI64(LAddI64* lir) {
++  Register dest = ToRegister(lir->getDef(0));
++  Register lhs = ToRegister(lir->getOperand(0));
++  const LAllocation* rhs = lir->getOperand(1);
++
++  if (rhs->isConstant()) {
++    if (lhs != dest) {
++      masm.movePtr(lhs, dest);
++    }
++    masm.addPtr(ImmWord(ToInt64(rhs)), dest);
++  } else {
++    masm.as_add(dest, lhs, ToRegister(rhs));
++  }
++}
++
++void CodeGenerator::visitSubI(LSubI* ins) {
++  LAllocation* lhs = ins->getOperand(0);
++  LAllocation* rhs = ins->getOperand(1);
++  Register dest = ToRegister(ins->getDef(0));
++
++  if (rhs->isConstant()) {
++    Imm32 imm(ToInt32(rhs));
++    if (ins->snapshot()) {
++      masm.move32(ToRegister(lhs), dest);
++      Label overflow;
++      masm.branchSub32(Assembler::Overflow, imm, dest, &overflow);
++      bailoutFrom(&overflow, ins->snapshot());
++    } else {
++      masm.move32(ToRegister(lhs), dest);
++      masm.sub32(imm, dest);
++    }
++  } else {
++    Register rhsReg = ToRegister(rhs);
++    if (ins->snapshot()) {
++      // as_subf(d, a, b) computes d = b - a, so subf(dest, rhs, lhs) = lhs -
++      // rhs
++      masm.as_subf(dest, rhsReg, ToRegister(lhs));
++      masm.as_extsw(SecondScratchReg, dest);
++      Label overflow;
++      masm.as_cmpd(dest, SecondScratchReg);
++      masm.ma_b(Assembler::NotEqual, &overflow);
++      masm.as_extsw(dest, dest);
++      bailoutFrom(&overflow, ins->snapshot());
++    } else {
++      masm.as_subf(dest, rhsReg, ToRegister(lhs));
++      masm.as_extsw(dest, dest);
++    }
++  }
++}
++
++void CodeGenerator::visitSubIntPtr(LSubIntPtr* ins) {
++  Register dest = ToRegister(ins->getDef(0));
++  Register lhs = ToRegister(ins->getOperand(0));
++  const LAllocation* rhs = ins->getOperand(1);
++
++  if (rhs->isConstant()) {
++    if (lhs != dest) {
++      masm.movePtr(lhs, dest);
++    }
++    masm.subPtr(Imm32(ToIntPtr(rhs)), dest);
++  } else {
++    // as_subf(d, a, b) = b - a
++    masm.as_subf(dest, ToRegister(rhs), lhs);
++  }
++}
++
++void CodeGenerator::visitSubI64(LSubI64* lir) {
++  Register dest = ToRegister(lir->getDef(0));
++  Register lhs = ToRegister(lir->getOperand(0));
++  const LAllocation* rhs = lir->getOperand(1);
++
++  if (rhs->isConstant()) {
++    if (lhs != dest) {
++      masm.movePtr(lhs, dest);
++    }
++    masm.sub64(Imm64(ToInt64(rhs)), Register64(dest));
++  } else {
++    // as_subf(d, a, b) = b - a
++    masm.as_subf(dest, ToRegister(rhs), lhs);
++  }
++}
++
++void CodeGenerator::visitMulI(LMulI* ins) {
++  Register dest = ToRegister(ins->getDef(0));
++  Register lhs = ToRegister(ins->getOperand(0));
++  const LAllocation* rhs = ins->getOperand(1);
++  MMul* mul = ins->mir();
++
++  if (rhs->isConstant()) {
++    int32_t constant = ToInt32(rhs);
++    Register src = lhs;
++
++    // Bailout on -0.0 before the special-case handling below, since cases
++    // like -1 and 0 return early and would skip the check.
++    if (mul->canBeNegativeZero() && constant <= 0) {
++      Assembler::Condition cond =
++          (constant == 0) ? Assembler::Signed : Assembler::Equal;
++      bailoutCmp32(cond, src, Imm32(0), ins->snapshot());
++    }
++
++    switch (constant) {
++      case -1:
++        if (mul->canOverflow()) {
++          Label ok;
++          masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN), &ok);
++          bailout(ins->snapshot());
++          masm.bind(&ok);
++        }
++        masm.as_neg(dest, src);
++        masm.as_extsw(dest, dest);
++        return;
++      case 0:
++        masm.move32(Imm32(0), dest);
++        return;
++      case 1:
++        masm.move32(src, dest);
++        return;
++      case 2:
++        if (mul->canOverflow()) {
++          masm.move32(src, dest);
++          Label overflow;
++          masm.branchAdd32(Assembler::Overflow, dest, dest, &overflow);
++          bailoutFrom(&overflow, ins->snapshot());
++        } else {
++          masm.move32(src, dest);
++          masm.add32(dest, dest);
++        }
++        return;
++      default:
++        break;
++    }
++
++    // Check for power of 2 (positive).
++    uint32_t absCst = mozilla::Abs(constant);
++    if (absCst > 0 && (absCst & (absCst - 1)) == 0 && !mul->canOverflow()) {
++      uint32_t shift = mozilla::FloorLog2(absCst);
++      masm.x_slwi(dest, src, shift);
++      if (constant < 0) {
++        masm.as_neg(dest, dest);
++      }
++      masm.as_extsw(dest, dest);
++      return;
++    }
++
++    // General case.
++    if (mul->canOverflow()) {
++      masm.move32(src, dest);
++      Label overflow;
++      masm.branchMul32(Assembler::Overflow, Imm32(constant), dest, &overflow);
++      bailoutFrom(&overflow, ins->snapshot());
++    } else {
++      masm.move32(src, dest);
++      masm.mul32(Imm32(constant), dest);
++    }
++
++    // Check for negative zero (for constants not handled above).
++    if (mul->canBeNegativeZero() && constant < 0) {
++      Label ok;
++      masm.branchPtr(Assembler::NotEqual, dest, ImmWord(0), &ok);
++      bailoutCmp32(Assembler::Signed, src, src, ins->snapshot());
++      masm.bind(&ok);
++    }
++    return;
++  }
++
++  Register rhsReg = ToRegister(rhs);
++
++  if (mul->canOverflow()) {
++    // Use 64-bit multiply so the full result is deterministic, then check
++    // whether truncating to 32 bits changes the value. Match the
++    // visitAddI/visitSubI ordering: branch first, truncate only on the
++    // success path (the bailout discards dest anyway). extsw is
++    // non-recording (ISA v3.0B) so it doesn't disturb CR0
++    // either way; the choice is for consistency.
++    masm.as_mulld(dest, lhs, rhsReg);
++    masm.as_extsw(SecondScratchReg, dest);
++    Label overflow;
++    masm.as_cmpd(dest, SecondScratchReg);
++    masm.ma_b(Assembler::NotEqual, &overflow);
++    masm.as_extsw(dest, dest);
++    bailoutFrom(&overflow, ins->snapshot());
++  } else {
++    masm.as_mullw(dest, lhs, rhsReg);
++    masm.as_extsw(dest, dest);
++  }
++
++  if (mul->canBeNegativeZero()) {
++    Label done;
++    masm.branchPtr(Assembler::NotEqual, dest, ImmWord(0), &done);
++    // Result is 0. Check if lhs|rhs was negative.
++    {
++      UseScratchRegisterScope temps(masm);
++      Register scratch = temps.Acquire();
++      masm.as_or_(scratch, lhs, rhsReg);
++      bailoutCmp32(Assembler::Signed, scratch, scratch, ins->snapshot());
++    }
++    masm.bind(&done);
++  }
++}
++
++void CodeGenerator::visitMulIntPtr(LMulIntPtr* ins) {
++  Register dest = ToRegister(ins->getDef(0));
++  Register lhs = ToRegister(ins->getOperand(0));
++  const LAllocation* rhs = ins->getOperand(1);
++
++  if (rhs->isConstant()) {
++    if (lhs != dest) {
++      masm.movePtr(lhs, dest);
++    }
++    masm.mulPtr(ImmWord(ToIntPtr(rhs)), dest);
++  } else {
++    masm.as_mulld(dest, lhs, ToRegister(rhs));
++  }
++}
++
++void CodeGenerator::visitMulI64(LMulI64* lir) {
++  Register dest = ToRegister(lir->getDef(0));
++  Register lhs = ToRegister(lir->getOperand(0));
++  const LAllocation* rhs = lir->getOperand(1);
++
++  if (rhs->isConstant()) {
++    if (lhs != dest) {
++      masm.movePtr(lhs, dest);
++    }
++    masm.mulPtr(ImmWord(ToInt64(rhs)), dest);
++  } else {
++    masm.as_mulld(dest, lhs, ToRegister(rhs));
++  }
++}
++
++void CodeGenerator::visitDivI(LDivI* ins) {
++  Register lhs = ToRegister(ins->lhs());
++  Register rhs = ToRegister(ins->rhs());
++  Register dest = ToRegister(ins->output());
++  Register temp = ToRegister(ins->temp0());
++  MDiv* mir = ins->mir();
++
++  Label done;
++
++  // Handle divide by zero.
++  if (mir->canBeDivideByZero()) {
++    if (mir->trapOnError()) {
++      Label nonZero;
++      masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
++      masm.wasmTrap(wasm::Trap::IntegerDivideByZero, mir->trapSiteDesc());
++      masm.bind(&nonZero);
++    } else if (mir->canTruncateInfinities()) {
++      Label nonZero;
++      masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
++      masm.move32(Imm32(0), dest);
++      masm.jump(&done);
++      masm.bind(&nonZero);
++    } else {
++      MOZ_ASSERT(mir->fallible());
++      bailoutCmp32(Assembler::Equal, rhs, Imm32(0), ins->snapshot());
++    }
++  }
++
++  // Handle INT32_MIN / -1 overflow.
++  if (mir->canBeNegativeOverflow()) {
++    Label notMinInt;
++    masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN), &notMinInt);
++    masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), &notMinInt);
++
++    if (mir->trapOnError()) {
++      masm.wasmTrap(wasm::Trap::IntegerOverflow, mir->trapSiteDesc());
++    } else if (mir->canTruncateOverflow()) {
++      masm.move32(Imm32(INT32_MIN), dest);
++      masm.jump(&done);
++    } else {
++      MOZ_ASSERT(mir->fallible());
++      bailout(ins->snapshot());
++    }
++    masm.bind(&notMinInt);
++  }
++
++  // Handle negative zero.
++  if (!mir->canTruncateNegativeZero() && mir->canBeNegativeZero()) {
++    Label ok;
++    masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(0), &ok);
++    bailoutCmp32(Assembler::LessThan, rhs, Imm32(0), ins->snapshot());
++    masm.bind(&ok);
++  }
++
++  // Perform the division.
++  masm.as_divw(dest, lhs, rhs);
++  masm.as_extsw(dest, dest);
++
++  // Check remainder if not truncatable.
++  if (!mir->canTruncateRemainder()) {
++    // Compute remainder: temp = lhs - (dest * rhs)
++    masm.as_mullw(temp, dest, rhs);
++    masm.as_subf(temp, temp, lhs);  // temp = lhs - temp
++    bailoutCmp32(Assembler::NotEqual, temp, Imm32(0), ins->snapshot());
++  }
++
++  masm.bind(&done);
++}
++
++void CodeGenerator::visitDivPowTwoI(LDivPowTwoI* ins) {
++  Register lhs = ToRegister(ins->numerator());
++  Register dest = ToRegister(ins->output());
++  UseScratchRegisterScope temps(masm);
++  Register tmp = temps.Acquire();
++  int32_t shift = ins->shift();
++
++  if (shift != 0) {
++    MDiv* mir = ins->mir();
++
++    if (!mir->isTruncated()) {
++      // If remainder != 0, bailout (check lower 'shift' bits).
++      masm.x_slwi(tmp, lhs, 32 - shift);
++      bailoutCmp32(Assembler::NotEqual, tmp, Imm32(0), ins->snapshot());
++    }
++
++    if (!mir->canBeNegativeDividend()) {
++      // Non-negative dividend: simple right shift.
++      masm.as_srawi(dest, lhs, shift);
++    } else {
++      // Need rounding adjustment for negative numbers.
++      // Add (1 << shift) - 1 if lhs is negative.
++      if (shift > 1) {
++        masm.as_srawi(tmp, lhs, 31);
++        masm.as_rlwinm(tmp, tmp, 0, 32 - shift, 31);
++      } else {
++        // shift == 1: extract sign bit into bit 31
++        masm.as_rlwinm(tmp, lhs, 1, 31, 31);
++      }
++      masm.add32(lhs, tmp);
++      masm.as_srawi(dest, tmp, shift);
++    }
++  } else {
++    masm.move32(lhs, dest);
++  }
++}
++
++void CodeGenerator::visitModI(LModI* ins) {
++  Register lhs = ToRegister(ins->lhs());
++  Register rhs = ToRegister(ins->rhs());
++  Register dest = ToRegister(ins->output());
++  UseScratchRegisterScope temps(masm);
++  Register temp = temps.Acquire();
++  MMod* mir = ins->mir();
++  Label done;
++
++  // Handle divide by zero.
++  if (mir->canBeDivideByZero()) {
++    if (mir->isTruncated()) {
++      if (mir->trapOnError()) {
++        Label nonZero;
++        masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
++        masm.wasmTrap(wasm::Trap::IntegerDivideByZero, mir->trapSiteDesc());
++        masm.bind(&nonZero);
++      } else {
++        // Truncated division by zero yields integer zero.
++        masm.move32(rhs, dest);
++        Label nonZero;
++        masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
++        masm.jump(&done);
++        masm.bind(&nonZero);
++      }
++    } else {
++      MOZ_ASSERT(mir->fallible());
++      bailoutCmp32(Assembler::Equal, rhs, Imm32(0), ins->snapshot());
++    }
++  }
++
++  // Handle INT32_MIN % -1.
++  // PPC64 divw is undefined for INT32_MIN / -1 (quotient overflows), so we
++  // must return 0 explicitly.  The wasm spec also defines rem_s(MIN, -1) = 0.
++  if (!mir->isUnsigned()) {
++    Label notMinOverflow;
++    masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN),
++                   &notMinOverflow);
++    masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), &notMinOverflow);
++    masm.move32(Imm32(0), dest);
++    masm.jump(&done);
++    masm.bind(&notMinOverflow);
++  }
++
++  if (HasPOWER9()) {
++    masm.as_modsw(dest, lhs, rhs);
++  } else {
++    masm.as_divw(temp, lhs, rhs);
++    masm.as_mullw(temp, temp, rhs);
++    masm.as_subf(dest, temp, lhs);
++  }
++  masm.as_extsw(dest, dest);
++
++  // If X%Y == 0 and X < 0, the result is -0, and we need to bail out.
++  if (mir->canBeNegativeDividend() && !mir->isTruncated()) {
++    MOZ_ASSERT(mir->fallible());
++    Label ok;
++    masm.branchPtr(Assembler::NotEqual, dest, ImmWord(0), &ok);
++    bailoutCmp32(Assembler::Signed, lhs, Imm32(0), ins->snapshot());
++    masm.bind(&ok);
++  }
++
++  masm.bind(&done);
++}
++
++void CodeGenerator::visitModPowTwoI(LModPowTwoI* ins) {
++  Register in = ToRegister(ins->getOperand(0));
++  Register out = ToRegister(ins->getDef(0));
++  MMod* mir = ins->mir();
++  int32_t shift = ins->shift();
++  uint32_t mask = (uint32_t(1) << shift) - 1;
++
++  if (mir->canBeNegativeDividend() && !mir->isTruncated()) {
++    Label nonNeg;
++    masm.branchPtr(Assembler::NotEqual, in, ImmWord(0), &nonNeg);
++    // in == 0: mod is 0, check for negative zero.
++    bailoutCmp32(Assembler::Signed, in, in, ins->snapshot());
++    masm.bind(&nonNeg);
++  }
++
++  Label negative, done;
++  masm.branch32(Assembler::Signed, in, in, &negative);
++
++  // Positive case: just mask.
++  masm.and32(Imm32(mask), in, out);
++  masm.jump(&done);
++
++  // Negative case: negate, mask, negate back.
++  masm.bind(&negative);
++  masm.as_neg(out, in);
++  masm.and32(Imm32(mask), out);
++  masm.as_neg(out, out);
++  masm.as_extsw(out, out);
++
++  if (!mir->isTruncated() && mir->canBeNegativeDividend()) {
++    Label ok;
++    masm.branchPtr(Assembler::NotEqual, out, ImmWord(0), &ok);
++    bailout(ins->snapshot());
++    masm.bind(&ok);
++  }
++
++  masm.bind(&done);
++}
++
++void CodeGenerator::visitModMaskI(LModMaskI* ins) {
++  Register src = ToRegister(ins->input());
++  Register dest = ToRegister(ins->output());
++  Register tmp0 = ToRegister(ins->temp0());
++  Register tmp1 = ToRegister(ins->temp1());
++  MMod* mir = ins->mir();
++
++  if (!mir->isTruncated() && mir->canBeNegativeDividend()) {
++    MOZ_ASSERT(mir->fallible());
++
++    Label bail;
++    masm.ma_mod_mask(src, dest, tmp0, tmp1, ins->shift(), &bail);
++    bailoutFrom(&bail, ins->snapshot());
++  } else {
++    masm.ma_mod_mask(src, dest, tmp0, tmp1, ins->shift(), nullptr);
++  }
++}
++
++void CodeGenerator::visitNegI(LNegI* ins) {
++  Register input = ToRegister(ins->input());
++  Register output = ToRegister(ins->output());
++  masm.as_neg(output, input);
++  masm.as_extsw(output, output);
++}
++
++void CodeGenerator::visitNegI64(LNegI64* ins) {
++  Register input = ToRegister64(ins->input()).reg;
++  Register output = ToOutRegister64(ins).reg;
++  masm.as_neg(output, input);
++}
++
++void CodeGenerator::visitUDivOrMod(LUDivOrMod* ins) {
++  Register lhs = ToRegister(ins->lhs());
++  Register rhs = ToRegister(ins->rhs());
++  Register output = ToRegister(ins->output());
++  UseScratchRegisterScope temps(masm);
++  Register temp = temps.Acquire();
++  Label done;
++
++  // Division by zero check.
++  if (ins->canBeDivideByZero()) {
++    if (ins->mir()->isTruncated()) {
++      if (ins->trapOnError()) {
++        Label nonZero;
++        masm.branch32(Assembler::NotEqual, rhs, Imm32(0), &nonZero);
++        masm.wasmTrap(wasm::Trap::IntegerDivideByZero, ins->trapSiteDesc());
++        masm.bind(&nonZero);
++      } else {
++        Label nonZero;
++        masm.branch32(Assembler::NotEqual, rhs, Imm32(0), &nonZero);
++        masm.move32(Imm32(0), output);
++        masm.jump(&done);
++        masm.bind(&nonZero);
++      }
++    } else {
++      bailoutCmp32(Assembler::Equal, rhs, Imm32(0), ins->snapshot());
++    }
++  }
++
++  // Zero-extend both operands to 64 bits for unsigned divide.
++  masm.move32To64ZeroExtend(lhs, Register64(lhs));
++  masm.move32To64ZeroExtend(rhs, Register64(rhs));
++
++  if (ins->mir()->isDiv()) {
++    // Division path: compute quotient. Check remainder if needed.
++    if (!ins->mir()->toDiv()->canTruncateRemainder()) {
++      if (HasPOWER9()) {
++        masm.as_moduw(temp, lhs, rhs);
++      } else {
++        masm.as_divwu(temp, lhs, rhs);
++        masm.as_mullw(temp, temp, rhs);
++        masm.as_subf(temp, temp, lhs);
++      }
++      bailoutCmp32(Assembler::NotEqual, temp, Imm32(0), ins->snapshot());
++    }
++    masm.as_divwu(output, lhs, rhs);
++  } else {
++    // Modulo path.
++    if (HasPOWER9()) {
++      masm.as_moduw(output, lhs, rhs);
++    } else {
++      masm.as_divwu(temp, lhs, rhs);
++      masm.as_mullw(temp, temp, rhs);
++      masm.as_subf(output, temp, lhs);
++    }
++  }
++
++  masm.as_extsw(output, output);
++
++  if (!ins->mir()->isTruncated()) {
++    bailoutCmp32(Assembler::LessThan, output, Imm32(0), ins->snapshot());
++  }
++
++  masm.bind(&done);
++}
++
++void CodeGenerator::visitDivOrModI64(LDivOrModI64* lir) {
++  Register lhs = ToRegister(lir->getOperand(0));
++  Register rhs = ToRegister(lir->getOperand(1));
++  Register output = ToRegister(lir->output());
++
++  Label done;
++
++  // Division by zero trap.
++  if (lir->canBeDivideByZero()) {
++    Label nonZero;
++    masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
++    masm.wasmTrap(wasm::Trap::IntegerDivideByZero, lir->trapSiteDesc());
++    masm.bind(&nonZero);
++  }
++
++  // INT64_MIN / -1 overflow trap (for div only).
++  if (lir->canBeNegativeOverflow()) {
++    Label notMinInt;
++    masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(INT64_MIN), &notMinInt);
++    masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), &notMinInt);
++    if (lir->mir()->isDiv()) {
++      masm.wasmTrap(wasm::Trap::IntegerOverflow, lir->trapSiteDesc());
++    } else {
++      masm.movePtr(ImmWord(0), output);
++      masm.jump(&done);
++    }
++    masm.bind(&notMinInt);
++  }
++
++  if (lir->mir()->isDiv()) {
++    masm.as_divd(output, lhs, rhs);
++  } else if (HasPOWER9()) {
++    masm.as_modsd(output, lhs, rhs);
++  } else {
++    masm.as_divd(output, lhs, rhs);
++    masm.as_mulld(output, output, rhs);
++    masm.as_subf(output, output, lhs);
++  }
++
++  masm.bind(&done);
++}
++
++void CodeGenerator::visitUDivOrModI64(LUDivOrModI64* lir) {
++  Register lhs = ToRegister(lir->getOperand(0));
++  Register rhs = ToRegister(lir->getOperand(1));
++  Register output = ToRegister(lir->output());
++
++  // Division by zero trap.
++  if (lir->canBeDivideByZero()) {
++    Label nonZero;
++    masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
++    masm.wasmTrap(wasm::Trap::IntegerDivideByZero, lir->trapSiteDesc());
++    masm.bind(&nonZero);
++  }
++
++  if (lir->mir()->isDiv()) {
++    masm.as_divdu(output, lhs, rhs);
++  } else if (HasPOWER9()) {
++    masm.as_modud(output, lhs, rhs);
++  } else {
++    masm.as_divdu(output, lhs, rhs);
++    masm.as_mulld(output, output, rhs);
++    masm.as_subf(output, output, lhs);
++  }
++}
++
++// ===============================================================
++// Visitors: Bitwise
++
++void CodeGenerator::visitBitNotI(LBitNotI* ins) {
++  Register input = ToRegister(ins->input());
++  Register dest = ToRegister(ins->output());
++  masm.as_nor(dest, input, input);
++  masm.as_extsw(dest, dest);
++}
++
++void CodeGenerator::visitBitNotI64(LBitNotI64* ins) {
++  Register input = ToRegister64(ins->input()).reg;
++  Register dest = ToOutRegister64(ins).reg;
++  masm.as_nor(dest, input, input);
++}
++
++void CodeGenerator::visitBitOpI(LBitOpI* ins) {
++  Register dest = ToRegister(ins->getDef(0));
++  Register lhs = ToRegister(ins->getOperand(0));
++  const LAllocation* rhs = ins->getOperand(1);
++
++  switch (ins->bitop()) {
++    case JSOp::BitOr:
++      if (rhs->isConstant()) {
++        masm.or32(Imm32(ToInt32(rhs)), lhs, dest);
++      } else {
++        masm.as_or_(dest, lhs, ToRegister(rhs));
++        masm.as_extsw(dest, dest);
++      }
++      break;
++    case JSOp::BitXor:
++      if (rhs->isConstant()) {
++        masm.xor32(Imm32(ToInt32(rhs)), lhs, dest);
++      } else {
++        masm.as_xor_(dest, lhs, ToRegister(rhs));
++        masm.as_extsw(dest, dest);
++      }
++      break;
++    case JSOp::BitAnd:
++      if (rhs->isConstant()) {
++        masm.and32(Imm32(ToInt32(rhs)), lhs, dest);
++      } else {
++        masm.as_and_(dest, lhs, ToRegister(rhs));
++        masm.as_extsw(dest, dest);
++      }
++      break;
++    default:
++      MOZ_CRASH("unexpected binary opcode");
++  }
++}
++
++void CodeGenerator::visitBitOpI64(LBitOpI64* lir) {
++  Register dest = ToRegister(lir->getDef(0));
++  Register lhs = ToRegister(lir->getOperand(0));
++  const LAllocation* rhs = lir->getOperand(1);
++
++  switch (lir->bitop()) {
++    case JSOp::BitOr:
++      if (rhs->isConstant()) {
++        if (lhs != dest) {
++          masm.movePtr(lhs, dest);
++        }
++        masm.or64(Imm64(ToInt64(rhs)), Register64(dest));
++      } else {
++        masm.as_or_(dest, lhs, ToRegister(rhs));
++      }
++      break;
++    case JSOp::BitXor:
++      if (rhs->isConstant()) {
++        if (lhs != dest) {
++          masm.movePtr(lhs, dest);
++        }
++        masm.xor64(Imm64(ToInt64(rhs)), Register64(dest));
++      } else {
++        masm.as_xor_(dest, lhs, ToRegister(rhs));
++      }
++      break;
++    case JSOp::BitAnd:
++      if (rhs->isConstant()) {
++        if (lhs != dest) {
++          masm.movePtr(lhs, dest);
++        }
++        masm.and64(Imm64(ToInt64(rhs)), Register64(dest));
++      } else {
++        masm.as_and_(dest, lhs, ToRegister(rhs));
++      }
++      break;
++    default:
++      MOZ_CRASH("unexpected binary opcode");
++  }
++}
++
++void CodeGenerator::visitShiftI(LShiftI* ins) {
++  Register lhs = ToRegister(ins->lhs());
++  const LAllocation* rhs = ins->rhs();
++  Register dest = ToRegister(ins->output());
++
++  if (rhs->isConstant()) {
++    int32_t shift = ToInt32(rhs) & 0x1f;
++    switch (ins->bitop()) {
++      case JSOp::Lsh:
++        if (shift) {
++          masm.lshift32(Imm32(shift), lhs, dest);
++        } else {
++          masm.move32(lhs, dest);
++        }
++        break;
++      case JSOp::Rsh:
++        if (shift) {
++          masm.rshift32Arithmetic(Imm32(shift), lhs, dest);
++        } else {
++          masm.move32(lhs, dest);
++        }
++        break;
++      case JSOp::Ursh:
++        if (shift) {
++          masm.rshift32(Imm32(shift), lhs, dest);
++        } else {
++          // x >>> 0 can produce values that need to be treated as unsigned.
++          masm.move32(lhs, dest);
++        }
++        if (ins->mir()->toUrsh()->fallible()) {
++          // x >>> 0 can produce values that don't fit in signed int32.
++          bailoutCmp32(Assembler::LessThan, dest, Imm32(0), ins->snapshot());
++        }
++        break;
++      default:
++        MOZ_CRASH("unexpected shift opcode");
++    }
++  } else {
++    Register shiftReg = ToRegister(rhs);
++    // PPC slw/srw/sraw use 6 bits of shift amount; JS requires mod 32.
++    UseScratchRegisterScope temps(masm);
++    Register masked = temps.Acquire();
++    masm.as_rlwinm(masked, shiftReg, 0, 27, 31);
++    switch (ins->bitop()) {
++      case JSOp::Lsh:
++        masm.as_slw(dest, lhs, masked);
++        masm.as_extsw(dest, dest);
++        break;
++      case JSOp::Rsh:
++        masm.as_sraw(dest, lhs, masked);
++        break;
++      case JSOp::Ursh:
++        masm.as_srw(dest, lhs, masked);
++        masm.as_extsw(dest, dest);
++        if (ins->mir()->toUrsh()->fallible()) {
++          bailoutCmp32(Assembler::LessThan, dest, Imm32(0), ins->snapshot());
++        }
++        break;
++      default:
++        MOZ_CRASH("unexpected shift opcode");
++    }
++  }
++}
++
++void CodeGenerator::visitShiftIntPtr(LShiftIntPtr* ins) {
++  Register lhs = ToRegister(ins->lhs());
++  Register dest = ToRegister(ins->output());
++
++  if (ins->rhs()->isConstant()) {
++    // ShiftIntPtr's RHS constant is IntPtr- or Int32-typed, not Int64. Use
++    // ToIntPtr() which dispatches on the underlying MIRType (the previous
++    // MConstant::toInt64() call asserted when the constant wasn't Int64).
++    int32_t shift = int32_t(ToIntPtr(ins->rhs())) & 0x3f;
++    switch (ins->bitop()) {
++      case JSOp::Lsh:
++        if (shift) {
++          masm.lshiftPtr(Imm32(shift), lhs, dest);
++        } else {
++          masm.movePtr(lhs, dest);
++        }
++        break;
++      case JSOp::Rsh:
++        if (shift) {
++          masm.rshiftPtrArithmetic(Imm32(shift), lhs, dest);
++        } else {
++          masm.movePtr(lhs, dest);
++        }
++        break;
++      case JSOp::Ursh:
++        if (shift) {
++          masm.rshiftPtr(Imm32(shift), lhs, dest);
++        } else {
++          masm.movePtr(lhs, dest);
++        }
++        break;
++      default:
++        MOZ_CRASH("unexpected shift opcode");
++    }
++  } else {
++    Register shiftReg = ToRegister(ins->rhs());
++    // sld/srd/srad use the low 7 bits of the shift count: counts >= 64
++    // produce 0 (sign-fill for srad). Mask to 6 bits for mod-64 semantics.
++    UseScratchRegisterScope temps(masm);
++    Register masked = temps.Acquire();
++    masm.as_rldicl(masked, shiftReg, 0, 58);
++    switch (ins->bitop()) {
++      case JSOp::Lsh:
++        masm.as_sld(dest, lhs, masked);
++        break;
++      case JSOp::Rsh:
++        masm.as_srad(dest, lhs, masked);
++        break;
++      case JSOp::Ursh:
++        masm.as_srd(dest, lhs, masked);
++        break;
++      default:
++        MOZ_CRASH("unexpected shift opcode");
++    }
++  }
++}
++
++void CodeGenerator::visitShiftI64(LShiftI64* lir) {
++  Register lhs = ToRegister64(lir->lhs()).reg;
++  Register dest = ToOutRegister64(lir).reg;
++  const LAllocation* rhs = lir->rhs();
++
++  if (rhs->isConstant()) {
++    int32_t shift = int32_t(rhs->toConstant()->toInt64()) & 0x3f;
++    switch (lir->bitop()) {
++      case JSOp::Lsh:
++        if (shift) {
++          masm.lshiftPtr(Imm32(shift), lhs, dest);
++        } else {
++          masm.movePtr(lhs, dest);
++        }
++        break;
++      case JSOp::Rsh:
++        if (shift) {
++          masm.rshiftPtrArithmetic(Imm32(shift), lhs, dest);
++        } else {
++          masm.movePtr(lhs, dest);
++        }
++        break;
++      case JSOp::Ursh:
++        if (shift) {
++          masm.rshiftPtr(Imm32(shift), lhs, dest);
++        } else {
++          masm.movePtr(lhs, dest);
++        }
++        break;
++      default:
++        MOZ_CRASH("unexpected shift opcode");
++    }
++  } else {
++    Register shiftReg = ToRegister(rhs);
++    // Wasm i64 shifts require shift count modulo 64. PPC64 sld/srd/srad
++    // use a 7-bit shift field, so shifts >= 64 produce 0 (or sign-fill
++    // for srad). Mask to 6 bits first.
++    UseScratchRegisterScope temps(masm);
++    Register masked = temps.Acquire();
++    masm.as_rldicl(masked, shiftReg, 0, 58);  // clrldi: keep low 6 bits
++    switch (lir->bitop()) {
++      case JSOp::Lsh:
++        masm.as_sld(dest, lhs, masked);
++        break;
++      case JSOp::Rsh:
++        masm.as_srad(dest, lhs, masked);
++        break;
++      case JSOp::Ursh:
++        masm.as_srd(dest, lhs, masked);
++        break;
++      default:
++        MOZ_CRASH("unexpected shift opcode");
++    }
++  }
++}
++
++void CodeGenerator::visitUrshD(LUrshD* ins) {
++  Register lhs = ToRegister(ins->lhs());
++  const LAllocation* rhs = ins->rhs();
++  FloatRegister dest = ToFloatRegister(ins->output());
++
++  Register temp = ToRegister(ins->temp0());
++
++  if (rhs->isConstant()) {
++    int32_t shift = ToInt32(rhs) & 0x1f;
++    if (shift) {
++      masm.rshift32(Imm32(shift), lhs, temp);
++    } else {
++      masm.move32(lhs, temp);
++    }
++  } else {
++    masm.move32(lhs, temp);
++    masm.rshift32(ToRegister(rhs), temp);
++  }
++
++  masm.convertUInt32ToDouble(temp, dest);
++}
++
++// ===============================================================
++// Visitors: Floating-point arithmetic
++
++void CodeGenerator::visitMathD(LMathD* math) {
++  FloatRegister lhs = ToFloatRegister(math->lhs());
++  FloatRegister rhs = ToFloatRegister(math->rhs());
++  FloatRegister dest = ToFloatRegister(math->output());
++
++  switch (math->jsop()) {
++    case JSOp::Add:
++      masm.as_fadd(dest, lhs, rhs);
++      break;
++    case JSOp::Sub:
++      masm.as_fsub(dest, lhs, rhs);
++      break;
++    case JSOp::Mul:
++      masm.as_fmul(dest, lhs, rhs);
++      break;
++    case JSOp::Div:
++      masm.as_fdiv(dest, lhs, rhs);
++      break;
++    default:
++      MOZ_CRASH("unexpected double opcode");
++  }
++}
++
++void CodeGenerator::visitMathF(LMathF* math) {
++  FloatRegister lhs = ToFloatRegister(math->lhs());
++  FloatRegister rhs = ToFloatRegister(math->rhs());
++  FloatRegister dest = ToFloatRegister(math->output());
++
++  switch (math->jsop()) {
++    case JSOp::Add:
++      masm.as_fadds(dest, lhs, rhs);
++      break;
++    case JSOp::Sub:
++      masm.as_fsubs(dest, lhs, rhs);
++      break;
++    case JSOp::Mul:
++      masm.as_fmuls(dest, lhs, rhs);
++      break;
++    case JSOp::Div:
++      masm.as_fdivs(dest, lhs, rhs);
++      break;
++    default:
++      MOZ_CRASH("unexpected float32 opcode");
++  }
++}
++
++void CodeGenerator::visitMinMaxD(LMinMaxD* ins) {
++  FloatRegister first = ToFloatRegister(ins->first());
++  FloatRegister second = ToFloatRegister(ins->second());
++  mozilla::DebugOnly<FloatRegister> output = ToFloatRegister(ins->output());
++
++  MOZ_ASSERT(first == output);
++  if (ins->mir()->isMax()) {
++    masm.maxDouble(second, first, /* handleNaN = */ true);
++  } else {
++    masm.minDouble(second, first, /* handleNaN = */ true);
++  }
++}
++
++void CodeGenerator::visitMinMaxF(LMinMaxF* ins) {
++  FloatRegister first = ToFloatRegister(ins->first());
++  FloatRegister second = ToFloatRegister(ins->second());
++  mozilla::DebugOnly<FloatRegister> output = ToFloatRegister(ins->output());
++
++  MOZ_ASSERT(first == output);
++  if (ins->mir()->isMax()) {
++    masm.maxFloat32(second, first, /* handleNaN = */ true);
++  } else {
++    masm.minFloat32(second, first, /* handleNaN = */ true);
++  }
++}
++
++void CodeGenerator::visitNegD(LNegD* ins) {
++  FloatRegister input = ToFloatRegister(ins->input());
++  FloatRegister output = ToFloatRegister(ins->output());
++  masm.as_fneg(output, input);
++}
++
++void CodeGenerator::visitNegF(LNegF* ins) {
++  FloatRegister input = ToFloatRegister(ins->input());
++  FloatRegister output = ToFloatRegister(ins->output());
++  masm.as_fneg(output, input);
++}
++
++void CodeGenerator::visitPowHalfD(LPowHalfD* ins) {
++  FloatRegister input = ToFloatRegister(ins->input());
++  FloatRegister output = ToFloatRegister(ins->output());
++
++  Label done, skip;
++
++  // Check for -Infinity.
++  masm.loadConstantDouble(NegativeInfinity<double>(), ScratchDoubleReg);
++  masm.branchDouble(Assembler::DoubleNotEqualOrUnordered, input,
++                    ScratchDoubleReg, &skip);
++  masm.loadConstantDouble(std::numeric_limits<double>::infinity(), output);
++  masm.jump(&done);
++
++  masm.bind(&skip);
++  // Add 0.0 to handle -0.
++  masm.loadConstantDouble(0.0, ScratchDoubleReg);
++  masm.as_fadd(output, input, ScratchDoubleReg);
++  masm.as_fsqrt(output, output);
++
++  masm.bind(&done);
++}
++
++void CodeGenerator::visitNotD(LNotD* ins) {
++  FloatRegister input = ToFloatRegister(ins->input());
++  Register dest = ToRegister(ins->output());
++
++  masm.loadConstantDouble(0.0, ScratchDoubleReg);
++  masm.as_fcmpu(input, ScratchDoubleReg);
++  masm.ma_cmp_set_dbl(dest, Assembler::DoubleEqualOrUnordered);
++}
++
++void CodeGenerator::visitNotF(LNotF* ins) {
++  FloatRegister input = ToFloatRegister(ins->input());
++  Register dest = ToRegister(ins->output());
++
++  masm.loadConstantFloat32(0.0f, ScratchFloat32Reg);
++  masm.as_fcmpu(input, ScratchFloat32Reg);
++  masm.ma_cmp_set_dbl(dest, Assembler::DoubleEqualOrUnordered);
++}
++
++// ===============================================================
++// Visitors: FP comparisons and branches
++
++void CodeGenerator::visitCompareD(LCompareD* comp) {
++  FloatRegister lhs = ToFloatRegister(comp->left());
++  FloatRegister rhs = ToFloatRegister(comp->right());
++  Register dest = ToRegister(comp->output());
++  Assembler::DoubleCondition cond =
++      comp->mir()->jsop() == JSOp::StrictEq ? Assembler::DoubleEqual
++      : comp->mir()->jsop() == JSOp::StrictNe
++          ? Assembler::DoubleNotEqualOrUnordered
++          : JSOpToDoubleCondition(comp->mir()->jsop());
++
++  masm.as_fcmpu(lhs, rhs);
++  masm.ma_cmp_set_dbl(dest, cond);
++}
++
++void CodeGenerator::visitCompareF(LCompareF* comp) {
++  FloatRegister lhs = ToFloatRegister(comp->left());
++  FloatRegister rhs = ToFloatRegister(comp->right());
++  Register dest = ToRegister(comp->output());
++  Assembler::DoubleCondition cond =
++      comp->mir()->jsop() == JSOp::StrictEq ? Assembler::DoubleEqual
++      : comp->mir()->jsop() == JSOp::StrictNe
++          ? Assembler::DoubleNotEqualOrUnordered
++          : JSOpToDoubleCondition(comp->mir()->jsop());
++
++  masm.as_fcmpu(lhs, rhs);
++  masm.ma_cmp_set_dbl(dest, cond);
++}
++
++void CodeGenerator::visitCompareDAndBranch(LCompareDAndBranch* comp) {
++  FloatRegister lhs = ToFloatRegister(comp->left());
++  FloatRegister rhs = ToFloatRegister(comp->right());
++
++  Assembler::DoubleCondition cond =
++      JSOpToDoubleCondition(comp->cmpMir()->jsop());
++  MBasicBlock* ifTrue = comp->ifTrue();
++  MBasicBlock* ifFalse = comp->ifFalse();
++
++  if (isNextBlock(ifFalse->lir())) {
++    branchToBlock(Assembler::DoubleFloat, cond, lhs, rhs, ifTrue);
++  } else {
++    branchToBlock(Assembler::DoubleFloat, Assembler::InvertCondition(cond), lhs,
++                  rhs, ifFalse);
++    jumpToBlock(ifTrue);
++  }
++}
++
++void CodeGenerator::visitCompareFAndBranch(LCompareFAndBranch* comp) {
++  FloatRegister lhs = ToFloatRegister(comp->left());
++  FloatRegister rhs = ToFloatRegister(comp->right());
++
++  Assembler::DoubleCondition cond =
++      JSOpToDoubleCondition(comp->cmpMir()->jsop());
++  MBasicBlock* ifTrue = comp->ifTrue();
++  MBasicBlock* ifFalse = comp->ifFalse();
++
++  if (isNextBlock(ifFalse->lir())) {
++    branchToBlock(Assembler::SingleFloat, cond, lhs, rhs, ifTrue);
++  } else {
++    branchToBlock(Assembler::SingleFloat, Assembler::InvertCondition(cond), lhs,
++                  rhs, ifFalse);
++    jumpToBlock(ifTrue);
++  }
++}
++
++void CodeGenerator::visitTestDAndBranch(LTestDAndBranch* test) {
++  FloatRegister input = ToFloatRegister(test->input());
++
++  MBasicBlock* ifTrue = test->ifTrue();
++  MBasicBlock* ifFalse = test->ifFalse();
++
++  masm.loadConstantDouble(0.0, ScratchDoubleReg);
++
++  if (isNextBlock(ifFalse->lir())) {
++    branchToBlock(Assembler::DoubleFloat, Assembler::DoubleNotEqual, input,
++                  ScratchDoubleReg, ifTrue);
++  } else {
++    branchToBlock(Assembler::DoubleFloat, Assembler::DoubleEqualOrUnordered,
++                  input, ScratchDoubleReg, ifFalse);
++    jumpToBlock(ifTrue);
++  }
++}
++
++void CodeGenerator::visitTestFAndBranch(LTestFAndBranch* test) {
++  FloatRegister input = ToFloatRegister(test->input());
++
++  MBasicBlock* ifTrue = test->ifTrue();
++  MBasicBlock* ifFalse = test->ifFalse();
++
++  masm.loadConstantFloat32(0.0f, ScratchFloat32Reg);
++
++  if (isNextBlock(ifFalse->lir())) {
++    branchToBlock(Assembler::SingleFloat, Assembler::DoubleNotEqual, input,
++                  ScratchFloat32Reg, ifTrue);
++  } else {
++    branchToBlock(Assembler::SingleFloat, Assembler::DoubleEqualOrUnordered,
++                  input, ScratchFloat32Reg, ifFalse);
++    jumpToBlock(ifTrue);
++  }
++}
++
++// ===============================================================
++// Visitors: Truncation
++
++void CodeGenerator::visitTruncateDToInt32(LTruncateDToInt32* ins) {
++  emitTruncateDouble(ToFloatRegister(ins->input()), ToRegister(ins->output()),
++                     ins->mir());
++}
++
++void CodeGenerator::visitTruncateFToInt32(LTruncateFToInt32* ins) {
++  emitTruncateFloat32(ToFloatRegister(ins->input()), ToRegister(ins->output()),
++                      ins->mir());
++}
++
++// ===============================================================
++// Visitors: Int64 / Wasm type conversions
++
++void CodeGenerator::visitExtendInt32ToInt64(LExtendInt32ToInt64* lir) {
++  Register input = ToRegister(lir->input());
++  Register output = ToRegister(lir->output());
++
++  if (lir->mir()->isUnsigned()) {
++    masm.move32To64ZeroExtend(input, Register64(output));
++  } else {
++    masm.as_extsw(output, input);
++  }
++}
++
++void CodeGenerator::visitWrapInt64ToInt32(LWrapInt64ToInt32* lir) {
++  const LInt64Allocation input = lir->input();
++  Register output = ToRegister(lir->output());
++
++  if (lir->mir()->bottomHalf()) {
++    if (input.value().isMemory()) {
++      masm.load32(ToAddress(input), output);
++    } else {
++      masm.move64To32(ToRegister64(input), output);
++    }
++  } else {
++    // The only producer of `bottomHalf=false` MWrapInt64ToInt32 in the
++    // current MIR pipeline is the GPR-pair argument splitter in
++    // WasmIonCompile.cpp, which is gated on JS_CODEGEN_REGISTER_PAIR
++    // (32-bit ARM only). PPC64 is 64-bit and never reaches this path.
++    // Matches the same defensive crash in x64 / ARM64 backends.
++    MOZ_CRASH("Not implemented.");
++  }
++}
++
++void CodeGenerator::visitSignExtendInt64(LSignExtendInt64* lir) {
++  Register64 input = ToRegister64(lir->input());
++  Register64 output = ToOutRegister64(lir);
++
++  switch (lir->mir()->mode()) {
++    case MSignExtendInt64::Byte:
++      masm.as_extsb(output.reg, input.reg);
++      break;
++    case MSignExtendInt64::Half:
++      masm.as_extsh(output.reg, input.reg);
++      break;
++    case MSignExtendInt64::Word:
++      masm.as_extsw(output.reg, input.reg);
++      break;
++  }
++}
++
++void CodeGenerator::visitWasmExtendU32Index(LWasmExtendU32Index* lir) {
++  Register input = ToRegister(lir->input());
++  Register output = ToRegister(lir->output());
++  masm.move32To64ZeroExtend(input, Register64(output));
++}
++
++void CodeGenerator::visitWasmWrapU32Index(LWasmWrapU32Index* lir) {
++  Register input = ToRegister(lir->input());
++  Register output = ToRegister(lir->output());
++  masm.move32(input, output);
++}
++
++void CodeGenerator::visitWasmTruncateToInt32(LWasmTruncateToInt32* lir) {
++  auto input = ToFloatRegister(lir->input());
++  auto output = ToRegister(lir->output());
++
++  MWasmTruncateToInt32* mir = lir->mir();
++  MIRType fromType = mir->input()->type();
++
++  MOZ_ASSERT(fromType == MIRType::Double || fromType == MIRType::Float32);
++
++  auto* ool = new (alloc()) OutOfLineWasmTruncateCheck(mir, input, output);
++  addOutOfLineCode(ool, mir);
++
++  Label* oolEntry = ool->entry();
++  if (mir->isUnsigned()) {
++    if (fromType == MIRType::Double) {
++      masm.wasmTruncateDoubleToUInt32(input, output, mir->isSaturating(),
++                                      oolEntry);
++    } else if (fromType == MIRType::Float32) {
++      masm.wasmTruncateFloat32ToUInt32(input, output, mir->isSaturating(),
++                                       oolEntry);
++    } else {
++      MOZ_CRASH("unexpected type");
++    }
++
++    masm.bind(ool->rejoin());
++    return;
++  }
++
++  if (fromType == MIRType::Double) {
++    masm.wasmTruncateDoubleToInt32(input, output, mir->isSaturating(),
++                                   oolEntry);
++  } else if (fromType == MIRType::Float32) {
++    masm.wasmTruncateFloat32ToInt32(input, output, mir->isSaturating(),
++                                    oolEntry);
++  } else {
++    MOZ_CRASH("unexpected type");
++  }
++
++  masm.bind(ool->rejoin());
++}
++
++void CodeGenerator::visitWasmTruncateToInt64(LWasmTruncateToInt64* lir) {
++  FloatRegister input = ToFloatRegister(lir->input());
++  Register64 output = ToOutRegister64(lir);
++
++  MWasmTruncateToInt64* mir = lir->mir();
++  MIRType fromType = mir->input()->type();
++
++  MOZ_ASSERT(fromType == MIRType::Double || fromType == MIRType::Float32);
++
++  auto* ool = new (alloc()) OutOfLineWasmTruncateCheck(mir, input, output);
++  addOutOfLineCode(ool, mir);
++
++  Label* oolEntry = ool->entry();
++  Label* oolRejoin = ool->rejoin();
++  bool isSaturating = mir->isSaturating();
++
++  if (fromType == MIRType::Double) {
++    if (mir->isUnsigned()) {
++      masm.wasmTruncateDoubleToUInt64(input, output, isSaturating, oolEntry,
++                                      oolRejoin, InvalidFloatReg);
++    } else {
++      masm.wasmTruncateDoubleToInt64(input, output, isSaturating, oolEntry,
++                                     oolRejoin, InvalidFloatReg);
++    }
++  } else {
++    if (mir->isUnsigned()) {
++      masm.wasmTruncateFloat32ToUInt64(input, output, isSaturating, oolEntry,
++                                       oolRejoin, InvalidFloatReg);
++    } else {
++      masm.wasmTruncateFloat32ToInt64(input, output, isSaturating, oolEntry,
++                                      oolRejoin, InvalidFloatReg);
++    }
++  }
++}
++
++void CodeGenerator::visitInt64ToFloatingPoint(LInt64ToFloatingPoint* lir) {
++  Register64 input = ToRegister64(lir->input());
++  FloatRegister output = ToFloatRegister(lir->output());
++  MIRType outputType = lir->mir()->type();
++
++  if (outputType == MIRType::Double) {
++    if (lir->mir()->isUnsigned()) {
++      masm.convertUInt64ToDouble(input, output, Register::Invalid());
++    } else {
++      masm.convertInt64ToDouble(input, output);
++    }
++  } else {
++    if (lir->mir()->isUnsigned()) {
++      masm.convertUInt64ToFloat32(input, output, Register::Invalid());
++    } else {
++      masm.convertInt64ToFloat32(input, output);
++    }
++  }
++}
++
++void CodeGenerator::visitWasmUint32ToDouble(LWasmUint32ToDouble* lir) {
++  Register input = ToRegister(lir->input());
++  FloatRegister output = ToFloatRegister(lir->output());
++  masm.convertUInt32ToDouble(input, output);
++}
++
++void CodeGenerator::visitWasmUint32ToFloat32(LWasmUint32ToFloat32* lir) {
++  Register input = ToRegister(lir->input());
++  FloatRegister output = ToFloatRegister(lir->output());
++  masm.convertUInt32ToFloat32(input, output);
++}
++
++void CodeGenerator::visitWasmBuiltinTruncateDToInt32(
++    LWasmBuiltinTruncateDToInt32* lir) {
++  emitTruncateDouble(ToFloatRegister(lir->getOperand(0)),
++                     ToRegister(lir->getDef(0)), lir->mir());
++}
++
++void CodeGenerator::visitWasmBuiltinTruncateFToInt32(
++    LWasmBuiltinTruncateFToInt32* lir) {
++  emitTruncateFloat32(ToFloatRegister(lir->getOperand(0)),
++                      ToRegister(lir->getDef(0)), lir->mir());
++}
++
++// ===============================================================
++// Visitors: Wasm load/store
++
++template <typename T>
++void CodeGeneratorPPC64::emitWasmLoad(T* lir) {
++  const MWasmLoad* mir = lir->mir();
++  UseScratchRegisterScope temps(masm);
++  Register scratch = temps.Acquire();
++
++  Register memoryBase = ToRegister(lir->memoryBase());
++  Register ptr = ToRegister(lir->ptr());
++  Register ptrScratch = ToTempRegisterOrInvalid(lir->temp0());
++
++  if (mir->base()->type() == MIRType::Int32) {
++    masm.move32To64ZeroExtend(ptr, Register64(scratch));
++    ptr = scratch;
++    ptrScratch = ptrScratch != InvalidReg ? scratch : InvalidReg;
++  }
++
++  masm.wasmLoad(mir->access(), memoryBase, ptr, ptrScratch,
++                ToAnyRegister(lir->output()));
++}
++
++template <typename T>
++void CodeGeneratorPPC64::emitWasmStore(T* lir) {
++  const MWasmStore* mir = lir->mir();
++  UseScratchRegisterScope temps(masm);
++  Register scratch = temps.Acquire();
++
++  Register memoryBase = ToRegister(lir->memoryBase());
++  Register ptr = ToRegister(lir->ptr());
++  Register ptrScratch = ToTempRegisterOrInvalid(lir->temp0());
++
++  if (mir->base()->type() == MIRType::Int32) {
++    masm.move32To64ZeroExtend(ptr, Register64(scratch));
++    ptr = scratch;
++    ptrScratch = ptrScratch != InvalidReg ? scratch : InvalidReg;
++  }
++
++  masm.wasmStore(mir->access(), ToAnyRegister(lir->value()), memoryBase, ptr,
++                 ptrScratch);
++}
++
++void CodeGenerator::visitWasmLoad(LWasmLoad* lir) { emitWasmLoad(lir); }
++
++void CodeGenerator::visitWasmStore(LWasmStore* lir) { emitWasmStore(lir); }
++
++void CodeGenerator::visitWasmLoadI64(LWasmLoadI64* lir) {
++  const MWasmLoad* mir = lir->mir();
++
++  Register memoryBase = ToRegister(lir->memoryBase());
++  Register ptrScratch = ToTempRegisterOrInvalid(lir->temp0());
++
++  Register ptrReg = ToRegister(lir->ptr());
++  if (mir->base()->type() == MIRType::Int32) {
++    masm.move32ZeroExtendToPtr(ptrReg, ptrReg);
++  }
++
++  masm.wasmLoadI64(mir->access(), memoryBase, ptrReg, ptrScratch,
++                   ToOutRegister64(lir));
++}
++
++void CodeGenerator::visitWasmStoreI64(LWasmStoreI64* lir) {
++  const MWasmStore* mir = lir->mir();
++
++  Register memoryBase = ToRegister(lir->memoryBase());
++  Register ptrScratch = ToTempRegisterOrInvalid(lir->temp0());
++
++  Register ptrReg = ToRegister(lir->ptr());
++  if (mir->base()->type() == MIRType::Int32) {
++    masm.move32ZeroExtendToPtr(ptrReg, ptrReg);
++  }
++
++  masm.wasmStoreI64(mir->access(), ToRegister64(lir->value()), memoryBase,
++                    ptrReg, ptrScratch);
++}
++
++void CodeGenerator::visitAsmJSLoadHeap(LAsmJSLoadHeap* ins) {
++  const MAsmJSLoadHeap* mir = ins->mir();
++  MOZ_ASSERT(!mir->hasMemoryBase());
++
++  const LAllocation* ptr = ins->ptr();
++  const LDefinition* output = ins->output();
++  const LAllocation* boundsCheckLimit = ins->boundsCheckLimit();
++
++  Register ptrReg = ToRegister(ptr);
++  Scalar::Type accessType = mir->accessType();
++  bool isFloat = accessType == Scalar::Float32 || accessType == Scalar::Float64;
++  Label done;
++
++  if (mir->needsBoundsCheck()) {
++    Label boundsCheckPassed;
++    Register boundsCheckLimitReg = ToRegister(boundsCheckLimit);
++    masm.wasmBoundsCheck32(Assembler::Below, ptrReg, boundsCheckLimitReg,
++                           &boundsCheckPassed);
++    if (isFloat) {
++      if (accessType == Scalar::Float32) {
++        masm.loadConstantFloat32(GenericNaN(), ToFloatRegister(output));
++      } else {
++        masm.loadConstantDouble(GenericNaN(), ToFloatRegister(output));
++      }
++    } else {
++      masm.movePtr(ImmWord(0), ToRegister(output));
++    }
++    masm.jump(&done);
++    masm.bind(&boundsCheckPassed);
++  }
++
++  UseScratchRegisterScope temps(masm);
++  Register scratch = temps.Acquire();
++  masm.move32To64ZeroExtend(ptrReg, Register64(scratch));
++
++  switch (accessType) {
++    case Scalar::Int8:
++      masm.as_lbzx(ToRegister(output), HeapReg, scratch);
++      masm.as_extsb(ToRegister(output), ToRegister(output));
++      break;
++    case Scalar::Uint8:
++      masm.as_lbzx(ToRegister(output), HeapReg, scratch);
++      break;
++    case Scalar::Int16:
++      masm.as_lhax(ToRegister(output), HeapReg, scratch);
++      break;
++    case Scalar::Uint16:
++      masm.as_lhzx(ToRegister(output), HeapReg, scratch);
++      break;
++    case Scalar::Int32:
++      masm.as_lwzx(ToRegister(output), HeapReg, scratch);
++      masm.as_extsw(ToRegister(output), ToRegister(output));
++      break;
++    case Scalar::Uint32:
++      masm.as_lwzx(ToRegister(output), HeapReg, scratch);
++      break;
++    case Scalar::Float64:
++      masm.as_lfdx(ToFloatRegister(output), HeapReg, scratch);
++      break;
++    case Scalar::Float32:
++      masm.as_lfsx(ToFloatRegister(output), HeapReg, scratch);
++      break;
++    default:
++      MOZ_CRASH("unexpected array type");
++  }
++
++  if (done.used()) {
++    masm.bind(&done);
++  }
++}
++
++void CodeGenerator::visitAsmJSStoreHeap(LAsmJSStoreHeap* ins) {
++  const MAsmJSStoreHeap* mir = ins->mir();
++  MOZ_ASSERT(!mir->hasMemoryBase());
++
++  const LAllocation* value = ins->value();
++  const LAllocation* ptr = ins->ptr();
++  const LAllocation* boundsCheckLimit = ins->boundsCheckLimit();
++
++  Register ptrReg = ToRegister(ptr);
++
++  Label done;
++  if (mir->needsBoundsCheck()) {
++    Register boundsCheckLimitReg = ToRegister(boundsCheckLimit);
++    masm.wasmBoundsCheck32(Assembler::AboveOrEqual, ptrReg, boundsCheckLimitReg,
++                           &done);
++  }
++
++  UseScratchRegisterScope temps(masm);
++  Register scratch = temps.Acquire();
++  masm.move32To64ZeroExtend(ptrReg, Register64(scratch));
++
++  switch (mir->accessType()) {
++    case Scalar::Int8:
++    case Scalar::Uint8:
++      masm.as_stbx(ToRegister(value), HeapReg, scratch);
++      break;
++    case Scalar::Int16:
++    case Scalar::Uint16:
++      masm.as_sthx(ToRegister(value), HeapReg, scratch);
++      break;
++    case Scalar::Int32:
++    case Scalar::Uint32:
++      masm.as_stwx(ToRegister(value), HeapReg, scratch);
++      break;
++    case Scalar::Float64:
++      masm.as_stfdx(ToFloatRegister(value), HeapReg, scratch);
++      break;
++    case Scalar::Float32:
++      masm.as_stfsx(ToFloatRegister(value), HeapReg, scratch);
++      break;
++    default:
++      MOZ_CRASH("unexpected array type");
++  }
++
++  if (done.used()) {
++    masm.bind(&done);
++  }
++}
++
++void CodeGenerator::visitWasmStackArg(LWasmStackArg* ins) {
++  const MWasmStackArg* mir = ins->mir();
++  if (ins->arg()->isConstant()) {
++    masm.storePtr(ImmWord(ToInt32(ins->arg())),
++                  Address(StackPointer, mir->spOffset()));
++  } else {
++    if (ins->arg()->isGeneralReg()) {
++      masm.storePtr(ToRegister(ins->arg()),
++                    Address(StackPointer, mir->spOffset()));
++    } else if (mir->input()->type() == MIRType::Double) {
++      masm.storeDouble(ToFloatRegister(ins->arg()),
++                       Address(StackPointer, mir->spOffset()));
++#ifdef ENABLE_WASM_SIMD
++    } else if (mir->input()->type() == MIRType::Simd128) {
++      masm.storeUnalignedSimd128(ToFloatRegister(ins->arg()),
++                                 Address(StackPointer, mir->spOffset()));
++#endif
++    } else {
++      masm.storeFloat32(ToFloatRegister(ins->arg()),
++                        Address(StackPointer, mir->spOffset()));
++    }
++  }
++}
++
++void CodeGenerator::visitWasmStackArgI64(LWasmStackArgI64* ins) {
++  const MWasmStackArg* mir = ins->mir();
++  Address dst(StackPointer, mir->spOffset());
++  if (IsConstant(ins->arg())) {
++    masm.store64(Imm64(ToInt64(ins->arg())), dst);
++  } else {
++    masm.store64(ToRegister64(ins->arg()), dst);
++  }
++}
++
++void CodeGenerator::visitWasmSelect(LWasmSelect* ins) {
++  MIRType mirType = ins->mir()->type();
++
++  Register cond = ToRegister(ins->condExpr());
++  const LAllocation* falseExpr = ins->falseExpr();
++
++  if (mirType == MIRType::Int32 || mirType == MIRType::WasmAnyRef) {
++    Register out = ToRegister(ins->output());
++    MOZ_ASSERT(ToRegister(ins->trueExpr()) == out,
++               "true expr input is reused for output");
++    if (falseExpr->isGeneralReg()) {
++      masm.moveIfZero(out, ToRegister(falseExpr), cond);
++    } else {
++      masm.cmp32Load32(Assembler::Zero, cond, cond, ToAddress(falseExpr), out);
++    }
++    return;
++  }
++
++  FloatRegister out = ToFloatRegister(ins->output());
++  MOZ_ASSERT(ToFloatRegister(ins->trueExpr()) == out,
++             "true expr input is reused for output");
++
++  if (falseExpr->isFloatReg()) {
++    Label done;
++    // The select condition is a 32-bit value; test 32 bits so high-bit garbage
++    // does not make a zero condition read as non-zero.
++    masm.branchTest32(Assembler::NonZero, cond, cond, &done);
++    if (mirType == MIRType::Float32) {
++      masm.moveFloat32(ToFloatRegister(falseExpr), out);
++    } else if (mirType == MIRType::Double) {
++      masm.moveDouble(ToFloatRegister(falseExpr), out);
++    } else if (mirType == MIRType::Simd128) {
++      masm.moveSimd128(ToFloatRegister(falseExpr), out);
++    } else {
++      MOZ_CRASH("unhandled type in visitWasmSelect!");
++    }
++    masm.bind(&done);
++  } else {
++    Label done;
++    // The select condition is a 32-bit value; test 32 bits so high-bit garbage
++    // does not make a zero condition read as non-zero.
++    masm.branchTest32(Assembler::NonZero, cond, cond, &done);
++
++    if (mirType == MIRType::Float32) {
++      masm.loadFloat32(ToAddress(falseExpr), out);
++    } else if (mirType == MIRType::Double) {
++      masm.loadDouble(ToAddress(falseExpr), out);
++    } else if (mirType == MIRType::Simd128) {
++      masm.loadUnalignedSimd128(ToAddress(falseExpr), out);
++    } else {
++      MOZ_CRASH("unhandled type in visitWasmSelect!");
++    }
++
++    masm.bind(&done);
++  }
++}
++
++void CodeGenerator::visitWasmSelectI64(LWasmSelectI64* lir) {
++  MOZ_ASSERT(lir->mir()->type() == MIRType::Int64);
++
++  Register cond = ToRegister(lir->condExpr());
++  LInt64Allocation falseExpr = lir->falseExpr();
++
++  Register64 out = ToOutRegister64(lir);
++  MOZ_ASSERT(ToRegister64(lir->trueExpr()) == out,
++             "true expr is reused for input");
++
++  if (falseExpr.value().isGeneralReg()) {
++    masm.moveIfZero(out.reg, ToRegister(falseExpr.value()), cond);
++  } else {
++    Label done;
++    // The select condition is a 32-bit value; test 32 bits so high-bit garbage
++    // does not make a zero condition read as non-zero.
++    masm.branchTest32(Assembler::NonZero, cond, cond, &done);
++    masm.loadPtr(ToAddress(falseExpr.value()), out.reg);
++    masm.bind(&done);
++  }
++}
++
++void CodeGenerator::visitWasmCompareAndSelect(LWasmCompareAndSelect* ins) {
++  MCompare::CompareType compTy = ins->compareType();
++  MIRType insTy = ins->mir()->type();
++  const bool cmpIs32 = compTy == MCompare::Compare_Int32 ||
++                       compTy == MCompare::Compare_UInt32;
++  const bool cmpIs64 = compTy == MCompare::Compare_Int64 ||
++                       compTy == MCompare::Compare_UInt64;
++  const bool selIsInt = insTy == MIRType::Int32 || insTy == MIRType::Int64;
++
++  MOZ_RELEASE_ASSERT(
++      (cmpIs32 || cmpIs64) && selIsInt,
++      "CodeGenerator::visitWasmCompareAndSelect: unexpected types");
++
++  Register trueExprAndDest = ToRegister(ins->output());
++  MOZ_ASSERT(ToRegister(ins->ifTrueExpr()) == trueExprAndDest,
++             "true expr input is reused for output");
++
++  Assembler::Condition cond =
++      Assembler::InvertCondition(JSOpToCondition(compTy, ins->jsop()));
++  Register lhs = ToRegister(ins->leftExpr());
++  Register rhs = ToRegister(ins->rightExpr());
++  Register falseExpr = ToRegister(ins->ifFalseExpr());
++
++  // isel operates on the whole 64-bit GPR regardless of compare width; only
++  // the compare instruction differs (cmpw/cmplw vs cmpd/cmpld).
++  if (cmpIs32) {
++    masm.cmp32Move32(cond, lhs, rhs, falseExpr, trueExprAndDest);
++  } else {
++    masm.cmpPtrMovePtr(cond, lhs, rhs, falseExpr, trueExprAndDest);
++  }
++}
++
++void CodeGenerator::visitWasmAddOffset(LWasmAddOffset* lir) {
++  MWasmAddOffset* mir = lir->mir();
++  Register base = ToRegister(lir->base());
++  Register out = ToRegister(lir->output());
++
++  Label ok;
++  masm.ma_add32TestCarry(Assembler::CarryClear, out, base, Imm32(mir->offset()),
++                         &ok);
++  masm.wasmTrap(wasm::Trap::OutOfBounds, mir->trapSiteDesc());
++  masm.bind(&ok);
++}
++
++void CodeGenerator::visitWasmAddOffset64(LWasmAddOffset64* lir) {
++  MWasmAddOffset* mir = lir->mir();
++  Register64 base = ToRegister64(lir->base());
++  Register64 out = ToOutRegister64(lir);
++
++  Label ok;
++  masm.ma_addPtrTestCarry(Assembler::CarryClear, out.reg, base.reg,
++                          ImmWord(mir->offset()), &ok);
++  masm.wasmTrap(wasm::Trap::OutOfBounds, mir->trapSiteDesc());
++  masm.bind(&ok);
++}
++
++// ===============================================================
++// Visitors: Effective Address
++
++void CodeGenerator::visitEffectiveAddress2(LEffectiveAddress2* ins) {
++  const MEffectiveAddress2* mir = ins->mir();
++  Register output = ToRegister(ins->output());
++
++  // EA = index * scale + displacement (no base register)
++  masm.movePtr(ImmWord(0), output);
++  BaseIndex addr(output, ToRegister(ins->index()), mir->scale(),
++                 mir->displacement());
++  masm.computeEffectiveAddress(addr, output);
++  // Sign-extend to 32-bit
++  masm.as_extsw(output, output);
++}
++
++void CodeGenerator::visitEffectiveAddress3(LEffectiveAddress3* ins) {
++  const MEffectiveAddress3* mir = ins->mir();
++  Register output = ToRegister(ins->output());
++
++  BaseIndex addr(ToRegister(ins->base()), ToRegister(ins->index()),
++                 mir->scale(), mir->displacement());
++  masm.computeEffectiveAddress(addr, output);
++  // Sign-extend to 32-bit
++  masm.as_extsw(output, output);
++}
++
++void CodeGenerator::visitWasmMulI64WideHI64(LWasmMulI64WideHI64* ins) {
++  Register lhs = ToRegister(ins->lhs());
++  Register rhs = ToRegister(ins->rhs());
++  Register output = ToRegister(ins->output());
++
++  if (ins->isSigned()) {
++    masm.as_mulhd(output, lhs, rhs);
++  } else {
++    masm.as_mulhdu(output, lhs, rhs);
++  }
++}
++
++// ===============================================================
++// Visitors: Typed Array Atomics
++
++void CodeGenerator::visitCompareExchangeTypedArrayElement(
++    LCompareExchangeTypedArrayElement* lir) {
++  Register elements = ToRegister(lir->elements());
++  AnyRegister output = ToAnyRegister(lir->output());
++  Register outTemp = ToTempRegisterOrInvalid(lir->temp0());
++
++  Register oldval = ToRegister(lir->oldval());
++  Register newval = ToRegister(lir->newval());
++  Register valueTemp = ToTempRegisterOrInvalid(lir->temp1());
++  Register offsetTemp = ToTempRegisterOrInvalid(lir->temp2());
++  Register maskTemp = ToTempRegisterOrInvalid(lir->temp3());
++  Scalar::Type arrayType = lir->mir()->arrayType();
++
++  auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++  dest.match([&](const auto& dest) {
++    masm.compareExchangeJS(arrayType, Synchronization::Full(), dest, oldval,
++                           newval, valueTemp, offsetTemp, maskTemp, outTemp,
++                           output);
++  });
++}
++
++void CodeGenerator::visitAtomicExchangeTypedArrayElement(
++    LAtomicExchangeTypedArrayElement* lir) {
++  Register elements = ToRegister(lir->elements());
++  AnyRegister output = ToAnyRegister(lir->output());
++  Register outTemp = ToTempRegisterOrInvalid(lir->temp0());
++
++  Register value = ToRegister(lir->value());
++  Register valueTemp = ToTempRegisterOrInvalid(lir->temp1());
++  Register offsetTemp = ToTempRegisterOrInvalid(lir->temp2());
++  Register maskTemp = ToTempRegisterOrInvalid(lir->temp3());
++  Scalar::Type arrayType = lir->mir()->arrayType();
++
++  auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++  dest.match([&](const auto& dest) {
++    masm.atomicExchangeJS(arrayType, Synchronization::Full(), dest, value,
++                          valueTemp, offsetTemp, maskTemp, outTemp, output);
++  });
++}
++
++void CodeGenerator::visitAtomicTypedArrayElementBinop(
++    LAtomicTypedArrayElementBinop* lir) {
++  MOZ_ASSERT(!lir->mir()->isForEffect());
++
++  AnyRegister output = ToAnyRegister(lir->output());
++  Register elements = ToRegister(lir->elements());
++  Register outTemp = ToTempRegisterOrInvalid(lir->temp0());
++  Register valueTemp = ToTempRegisterOrInvalid(lir->temp1());
++  Register offsetTemp = ToTempRegisterOrInvalid(lir->temp2());
++  Register maskTemp = ToTempRegisterOrInvalid(lir->temp3());
++  Register value = ToRegister(lir->value());
++  Scalar::Type arrayType = lir->mir()->arrayType();
++
++  auto mem = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++  mem.match([&](const auto& mem) {
++    masm.atomicFetchOpJS(arrayType, Synchronization::Full(),
++                         lir->mir()->operation(), value, mem, valueTemp,
++                         offsetTemp, maskTemp, outTemp, output);
++  });
++}
++
++void CodeGenerator::visitAtomicTypedArrayElementBinopForEffect(
++    LAtomicTypedArrayElementBinopForEffect* lir) {
++  MOZ_ASSERT(lir->mir()->isForEffect());
++
++  Register elements = ToRegister(lir->elements());
++  Register valueTemp = ToTempRegisterOrInvalid(lir->temp0());
++  Register offsetTemp = ToTempRegisterOrInvalid(lir->temp1());
++  Register maskTemp = ToTempRegisterOrInvalid(lir->temp2());
++  Register value = ToRegister(lir->value());
++  Scalar::Type arrayType = lir->mir()->arrayType();
++
++  auto mem = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++  mem.match([&](const auto& mem) {
++    masm.atomicEffectOpJS(arrayType, Synchronization::Full(),
++                          lir->mir()->operation(), value, mem, valueTemp,
++                          offsetTemp, maskTemp);
++  });
++}
++
++void CodeGenerator::visitCompareExchangeTypedArrayElement64(
++    LCompareExchangeTypedArrayElement64* lir) {
++  Register elements = ToRegister(lir->elements());
++  Register64 oldval = ToRegister64(lir->oldval());
++  Register64 newval = ToRegister64(lir->newval());
++  Register64 out = ToOutRegister64(lir);
++  Scalar::Type arrayType = lir->mir()->arrayType();
++
++  auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++  dest.match([&](const auto& dest) {
++    masm.compareExchange64(Synchronization::Full(), dest, oldval, newval, out);
++  });
++}
++
++void CodeGenerator::visitAtomicExchangeTypedArrayElement64(
++    LAtomicExchangeTypedArrayElement64* lir) {
++  Register elements = ToRegister(lir->elements());
++  Register64 value = ToRegister64(lir->value());
++  Register64 out = ToOutRegister64(lir);
++  Scalar::Type arrayType = lir->mir()->arrayType();
++
++  auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++  dest.match([&](const auto& dest) {
++    masm.atomicExchange64(Synchronization::Full(), dest, value, out);
++  });
++}
++
++void CodeGenerator::visitAtomicTypedArrayElementBinop64(
++    LAtomicTypedArrayElementBinop64* lir) {
++  MOZ_ASSERT(lir->mir()->hasUses());
++
++  Register elements = ToRegister(lir->elements());
++  Register64 value = ToRegister64(lir->value());
++  Register64 temp = ToRegister64(lir->temp0());
++  Register64 out = ToOutRegister64(lir);
++
++  Scalar::Type arrayType = lir->mir()->arrayType();
++  AtomicOp atomicOp = lir->mir()->operation();
++
++  auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++  dest.match([&](const auto& dest) {
++    masm.atomicFetchOp64(Synchronization::Full(), atomicOp, value, dest, temp,
++                         out);
++  });
++}
++
++void CodeGenerator::visitAtomicTypedArrayElementBinopForEffect64(
++    LAtomicTypedArrayElementBinopForEffect64* lir) {
++  MOZ_ASSERT(!lir->mir()->hasUses());
++
++  Register elements = ToRegister(lir->elements());
++  Register64 value = ToRegister64(lir->value());
++  Register64 temp = ToRegister64(lir->temp0());
++
++  Scalar::Type arrayType = lir->mir()->arrayType();
++  AtomicOp atomicOp = lir->mir()->operation();
++
++  auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++  dest.match([&](const auto& dest) {
++    masm.atomicEffectOp64(Synchronization::Full(), atomicOp, value, dest, temp);
++  });
++}
++
++void CodeGenerator::visitAtomicLoad64(LAtomicLoad64* lir) {
++  Register elements = ToRegister(lir->elements());
++  Register64 out = ToOutRegister64(lir);
++  Scalar::Type storageType = lir->mir()->storageType();
++
++  auto source = ToAddressOrBaseIndex(elements, lir->index(), storageType);
++
++  auto sync = Synchronization::Load();
++  masm.memoryBarrierBefore(sync);
++  source.match([&](const auto& source) { masm.load64(source, out); });
++  masm.memoryBarrierAfter(sync);
++}
++
++void CodeGenerator::visitAtomicStore64(LAtomicStore64* lir) {
++  Register elements = ToRegister(lir->elements());
++  Register64 value = ToRegister64(lir->value());
++  Scalar::Type writeType = lir->mir()->writeType();
++
++  auto dest = ToAddressOrBaseIndex(elements, lir->index(), writeType);
++
++  auto sync = Synchronization::Store();
++  masm.memoryBarrierBefore(sync);
++  dest.match([&](const auto& dest) { masm.store64(value, dest); });
++  masm.memoryBarrierAfter(sync);
++}
++
++// Wasm Atomics
++void CodeGenerator::visitWasmCompareExchangeHeap(
++    LWasmCompareExchangeHeap* ins) {
++  MWasmCompareExchangeHeap* mir = ins->mir();
++  Register memoryBase = ToRegister(ins->memoryBase());
++  Register ptrReg = ToRegister(ins->ptr());
++  BaseIndex srcAddr(memoryBase, ptrReg, TimesOne, mir->access().offset32());
++
++  Register oldval = ToRegister(ins->oldValue());
++  Register newval = ToRegister(ins->newValue());
++  Register valueTemp = ToTempRegisterOrInvalid(ins->temp0());
++  Register offsetTemp = ToTempRegisterOrInvalid(ins->temp1());
++  Register maskTemp = ToTempRegisterOrInvalid(ins->temp2());
++
++  masm.wasmCompareExchange(mir->access(), srcAddr, oldval, newval, valueTemp,
++                           offsetTemp, maskTemp, ToRegister(ins->output()));
++}
++
++void CodeGenerator::visitWasmAtomicExchangeHeap(LWasmAtomicExchangeHeap* ins) {
++  MWasmAtomicExchangeHeap* mir = ins->mir();
++  Register memoryBase = ToRegister(ins->memoryBase());
++  Register ptrReg = ToRegister(ins->ptr());
++  Register value = ToRegister(ins->value());
++  BaseIndex srcAddr(memoryBase, ptrReg, TimesOne, mir->access().offset32());
++
++  Register valueTemp = ToTempRegisterOrInvalid(ins->temp0());
++  Register offsetTemp = ToTempRegisterOrInvalid(ins->temp1());
++  Register maskTemp = ToTempRegisterOrInvalid(ins->temp2());
++
++  masm.wasmAtomicExchange(mir->access(), srcAddr, value, valueTemp, offsetTemp,
++                          maskTemp, ToRegister(ins->output()));
++}
++
++void CodeGenerator::visitWasmAtomicBinopHeap(LWasmAtomicBinopHeap* ins) {
++  MOZ_ASSERT(ins->mir()->hasUses());
++
++  MWasmAtomicBinopHeap* mir = ins->mir();
++  Register memoryBase = ToRegister(ins->memoryBase());
++  Register ptrReg = ToRegister(ins->ptr());
++  Register valueTemp = ToTempRegisterOrInvalid(ins->temp0());
++  Register offsetTemp = ToTempRegisterOrInvalid(ins->temp1());
++  Register maskTemp = ToTempRegisterOrInvalid(ins->temp2());
++
++  BaseIndex srcAddr(memoryBase, ptrReg, TimesOne, mir->access().offset32());
++
++  masm.wasmAtomicFetchOp(mir->access(), mir->operation(),
++                         ToRegister(ins->value()), srcAddr, valueTemp,
++                         offsetTemp, maskTemp, ToRegister(ins->output()));
++}
++
++void CodeGenerator::visitWasmAtomicBinopHeapForEffect(
++    LWasmAtomicBinopHeapForEffect* ins) {
++  MOZ_ASSERT(!ins->mir()->hasUses());
++
++  MWasmAtomicBinopHeap* mir = ins->mir();
++  Register memoryBase = ToRegister(ins->memoryBase());
++  Register ptrReg = ToRegister(ins->ptr());
++  Register valueTemp = ToTempRegisterOrInvalid(ins->temp0());
++  Register offsetTemp = ToTempRegisterOrInvalid(ins->temp1());
++  Register maskTemp = ToTempRegisterOrInvalid(ins->temp2());
++
++  BaseIndex srcAddr(memoryBase, ptrReg, TimesOne, mir->access().offset32());
++  masm.wasmAtomicEffectOp(mir->access(), mir->operation(),
++                          ToRegister(ins->value()), srcAddr, valueTemp,
++                          offsetTemp, maskTemp);
++}
++
++void CodeGenerator::visitWasmCompareExchangeI64(LWasmCompareExchangeI64* lir) {
++  Register memoryBase = ToRegister(lir->memoryBase());
++  Register ptr = ToRegister(lir->ptr());
++  Register64 oldValue = ToRegister64(lir->oldValue());
++  Register64 newValue = ToRegister64(lir->newValue());
++  Register64 output = ToOutRegister64(lir);
++  uint32_t offset = lir->mir()->access().offset32();
++
++  BaseIndex addr(memoryBase, ptr, TimesOne, offset);
++  masm.wasmCompareExchange64(lir->mir()->access(), addr, oldValue, newValue,
++                             output);
++}
++
++void CodeGenerator::visitWasmAtomicExchangeI64(LWasmAtomicExchangeI64* lir) {
++  Register memoryBase = ToRegister(lir->memoryBase());
++  Register ptr = ToRegister(lir->ptr());
++  Register64 value = ToRegister64(lir->value());
++  Register64 output = ToOutRegister64(lir);
++  uint32_t offset = lir->mir()->access().offset32();
++
++  BaseIndex addr(memoryBase, ptr, TimesOne, offset);
++  masm.wasmAtomicExchange64(lir->mir()->access(), addr, value, output);
++}
++
++void CodeGenerator::visitWasmAtomicBinopI64(LWasmAtomicBinopI64* lir) {
++  Register memoryBase = ToRegister(lir->memoryBase());
++  Register ptr = ToRegister(lir->ptr());
++  Register64 value = ToRegister64(lir->value());
++  Register64 output = ToOutRegister64(lir);
++  Register64 temp = ToRegister64(lir->temp0());
++  uint32_t offset = lir->mir()->access().offset32();
++
++  BaseIndex addr(memoryBase, ptr, TimesOne, offset);
++
++  masm.wasmAtomicFetchOp64(lir->mir()->access(), lir->mir()->operation(), value,
++                           addr, temp, output);
++}
++
++// SIMD code generators.
++void CodeGenerator::visitSimd128(LSimd128* ins) {
++  FloatRegister dest = ToFloatRegister(ins->output());
++  masm.loadConstantSimd128(ins->simd128(), dest);
++}
++void CodeGenerator::visitWasmTernarySimd128(LWasmTernarySimd128* ins) {
++  FloatRegister v0 = ToFloatRegister(ins->v0());
++  FloatRegister v1 = ToFloatRegister(ins->v1());
++  FloatRegister v2 = ToFloatRegister(ins->v2());
++  FloatRegister dest = ToFloatRegister(ins->output());
++  switch (ins->simdOp()) {
++    case wasm::SimdOp::V128Bitselect:
++      // bitselect(v0, v1, v2): result = (v0 & v2) | (v1 & ~v2)
++      // xxsel: XC=0→XA, XC=1→XB → (XA & ~XC) | (XB & XC)
++      // Need XA=v1, XB=v0, XC=v2.
++      masm.as_xxsel(dest, v1, v0, v2);
++      break;
++    case wasm::SimdOp::I8x16RelaxedLaneSelect:
++    case wasm::SimdOp::I16x8RelaxedLaneSelect:
++    case wasm::SimdOp::I32x4RelaxedLaneSelect:
++    case wasm::SimdOp::I64x2RelaxedLaneSelect:
++      // relaxed laneSelect(v0, v1, mask=v2): same as bitselect
++      masm.as_xxsel(dest, v1, v0, v2);
++      break;
++    // Lowering uses defineReuseInput on V2Index for ternary ops — the
++    // allocator is required to place `dest` in v2's slot. Assert that
++    // here; the FMA/dot helpers write their result through v2 in-place,
++    // so dest == v2 makes the trailing moveSimd128 unnecessary.
++    case wasm::SimdOp::I32x4RelaxedDotI8x16I7x16AddS:
++      MOZ_ASSERT(dest == v2);
++      masm.dotInt8x16Int7x16ThenAdd(v0, v1, v2,
++                                    ToFloatRegister(ins->temp0()));
++      break;
++    case wasm::SimdOp::F32x4RelaxedMadd:
++      MOZ_ASSERT(dest == v2);
++      masm.fmaFloat32x4(v0, v1, v2);
++      break;
++    case wasm::SimdOp::F64x2RelaxedMadd:
++      MOZ_ASSERT(dest == v2);
++      masm.fmaFloat64x2(v0, v1, v2);
++      break;
++    case wasm::SimdOp::F32x4RelaxedNmadd:
++      MOZ_ASSERT(dest == v2);
++      masm.fnmaFloat32x4(v0, v1, v2);
++      break;
++    case wasm::SimdOp::F64x2RelaxedNmadd:
++      MOZ_ASSERT(dest == v2);
++      masm.fnmaFloat64x2(v0, v1, v2);
++      break;
++    default:
++      MOZ_CRASH("PPC64: NYI SIMD ternary op");
++  }
++}
++void CodeGenerator::visitWasmBinarySimd128(LWasmBinarySimd128* ins) {
++  FloatRegister lhs = ToFloatRegister(ins->lhs());
++  FloatRegister rhs = ToFloatRegister(ins->rhs());
++  FloatRegister dest = ToFloatRegister(ins->output());
++  switch (ins->simdOp()) {
++    // Bitwise
++    case wasm::SimdOp::V128And:
++      masm.bitwiseAndSimd128(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::V128Or:
++      masm.bitwiseOrSimd128(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::V128Xor:
++      masm.bitwiseXorSimd128(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::V128AndNot:
++      masm.bitwiseAndNotSimd128(lhs, rhs, dest);
++      break;
++    // Integer add
++    case wasm::SimdOp::I8x16Add:
++      masm.addInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8Add:
++      masm.addInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4Add:
++      masm.addInt32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2Add:
++      masm.addInt64x2(lhs, rhs, dest);
++      break;
++    // Integer sub
++    case wasm::SimdOp::I8x16Sub:
++      masm.subInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8Sub:
++      masm.subInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4Sub:
++      masm.subInt32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2Sub:
++      masm.subInt64x2(lhs, rhs, dest);
++      break;
++    // Saturating add
++    case wasm::SimdOp::I8x16AddSatS:
++      masm.addSatInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16AddSatU:
++      masm.unsignedAddSatInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8AddSatS:
++      masm.addSatInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8AddSatU:
++      masm.unsignedAddSatInt16x8(lhs, rhs, dest);
++      break;
++    // Saturating sub
++    case wasm::SimdOp::I8x16SubSatS:
++      masm.subSatInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16SubSatU:
++      masm.unsignedSubSatInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8SubSatS:
++      masm.subSatInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8SubSatU:
++      masm.unsignedSubSatInt16x8(lhs, rhs, dest);
++      break;
++    // Integer multiply
++    case wasm::SimdOp::I16x8Mul:
++      masm.mulInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4Mul:
++      masm.mulInt32x4(lhs, rhs, dest);
++      break;
++    // Integer min/max signed
++    case wasm::SimdOp::I8x16MinS:
++      masm.minInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16MaxS:
++      masm.maxInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8MinS:
++      masm.minInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8MaxS:
++      masm.maxInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4MinS:
++      masm.minInt32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4MaxS:
++      masm.maxInt32x4(lhs, rhs, dest);
++      break;
++    // Integer min/max unsigned
++    case wasm::SimdOp::I8x16MinU:
++      masm.unsignedMinInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16MaxU:
++      masm.unsignedMaxInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8MinU:
++      masm.unsignedMinInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8MaxU:
++      masm.unsignedMaxInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4MinU:
++      masm.unsignedMinInt32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4MaxU:
++      masm.unsignedMaxInt32x4(lhs, rhs, dest);
++      break;
++    // Average unsigned
++    case wasm::SimdOp::I8x16AvgrU:
++      masm.unsignedAverageInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8AvgrU:
++      masm.unsignedAverageInt16x8(lhs, rhs, dest);
++      break;
++    // Q15 multiply
++    case wasm::SimdOp::I16x8Q15MulrSatS:
++      masm.q15MulrSatInt16x8(lhs, rhs, dest);
++      break;
++    // Integer compare
++    case wasm::SimdOp::I8x16Eq:
++      masm.compareInt8x16(Assembler::Equal, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16Ne:
++      masm.compareInt8x16(Assembler::NotEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16LtS:
++      masm.compareInt8x16(Assembler::LessThan, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16GtS:
++      masm.compareInt8x16(Assembler::GreaterThan, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16LeS:
++      masm.compareInt8x16(Assembler::LessThanOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16GeS:
++      masm.compareInt8x16(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16LtU:
++      masm.compareInt8x16(Assembler::Below, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16GtU:
++      masm.compareInt8x16(Assembler::Above, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16LeU:
++      masm.compareInt8x16(Assembler::BelowOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16GeU:
++      masm.compareInt8x16(Assembler::AboveOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8Eq:
++      masm.compareInt16x8(Assembler::Equal, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8Ne:
++      masm.compareInt16x8(Assembler::NotEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8LtS:
++      masm.compareInt16x8(Assembler::LessThan, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8GtS:
++      masm.compareInt16x8(Assembler::GreaterThan, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8LeS:
++      masm.compareInt16x8(Assembler::LessThanOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8GeS:
++      masm.compareInt16x8(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8LtU:
++      masm.compareInt16x8(Assembler::Below, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8GtU:
++      masm.compareInt16x8(Assembler::Above, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8LeU:
++      masm.compareInt16x8(Assembler::BelowOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8GeU:
++      masm.compareInt16x8(Assembler::AboveOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4Eq:
++      masm.compareInt32x4(Assembler::Equal, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4Ne:
++      masm.compareInt32x4(Assembler::NotEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4LtS:
++      masm.compareInt32x4(Assembler::LessThan, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4GtS:
++      masm.compareInt32x4(Assembler::GreaterThan, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4LeS:
++      masm.compareInt32x4(Assembler::LessThanOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4GeS:
++      masm.compareInt32x4(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4LtU:
++      masm.compareInt32x4(Assembler::Below, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4GtU:
++      masm.compareInt32x4(Assembler::Above, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4LeU:
++      masm.compareInt32x4(Assembler::BelowOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4GeU:
++      masm.compareInt32x4(Assembler::AboveOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2Eq:
++      masm.compareInt64x2(Assembler::Equal, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2Ne:
++      masm.compareInt64x2(Assembler::NotEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2LtS:
++      masm.compareInt64x2(Assembler::LessThan, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2GtS:
++      masm.compareInt64x2(Assembler::GreaterThan, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2LeS:
++      masm.compareInt64x2(Assembler::LessThanOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2GeS:
++      masm.compareInt64x2(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
++      break;
++    // Float compare
++    case wasm::SimdOp::F32x4Eq:
++      masm.compareFloat32x4(Assembler::Equal, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F32x4Ne:
++      masm.compareFloat32x4(Assembler::NotEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F32x4Lt:
++      masm.compareFloat32x4(Assembler::LessThan, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F32x4Gt:
++      masm.compareFloat32x4(Assembler::GreaterThan, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F32x4Le:
++      masm.compareFloat32x4(Assembler::LessThanOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F32x4Ge:
++      masm.compareFloat32x4(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F64x2Eq:
++      masm.compareFloat64x2(Assembler::Equal, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F64x2Ne:
++      masm.compareFloat64x2(Assembler::NotEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F64x2Lt:
++      masm.compareFloat64x2(Assembler::LessThan, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F64x2Gt:
++      masm.compareFloat64x2(Assembler::GreaterThan, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F64x2Le:
++      masm.compareFloat64x2(Assembler::LessThanOrEqual, lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F64x2Ge:
++      masm.compareFloat64x2(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
++      break;
++    // Float arithmetic
++    case wasm::SimdOp::F32x4Add:
++      masm.addFloat32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F32x4Sub:
++      masm.subFloat32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F32x4Mul:
++      masm.mulFloat32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F32x4Div:
++      masm.divFloat32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F32x4Min:
++      masm.minFloat32x4(lhs, rhs, dest, ToFloatRegister(ins->getTemp(0)),
++                         ToFloatRegister(ins->getTemp(1)));
++      break;
++    case wasm::SimdOp::F32x4Max:
++      masm.maxFloat32x4(lhs, rhs, dest, ToFloatRegister(ins->getTemp(0)),
++                         ToFloatRegister(ins->getTemp(1)));
++      break;
++    case wasm::SimdOp::F32x4PMin:
++      masm.pseudoMinFloat32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F32x4PMax:
++      masm.pseudoMaxFloat32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F64x2Add:
++      masm.addFloat64x2(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F64x2Sub:
++      masm.subFloat64x2(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F64x2Mul:
++      masm.mulFloat64x2(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F64x2Div:
++      masm.divFloat64x2(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F64x2Min:
++      masm.minFloat64x2(lhs, rhs, dest, ToFloatRegister(ins->getTemp(0)),
++                         ToFloatRegister(ins->getTemp(1)));
++      break;
++    case wasm::SimdOp::F64x2Max:
++      masm.maxFloat64x2(lhs, rhs, dest, ToFloatRegister(ins->getTemp(0)),
++                         ToFloatRegister(ins->getTemp(1)));
++      break;
++    case wasm::SimdOp::F64x2PMin:
++      masm.pseudoMinFloat64x2(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::F64x2PMax:
++      masm.pseudoMaxFloat64x2(lhs, rhs, dest);
++      break;
++    // Narrow
++    case wasm::SimdOp::I8x16NarrowI16x8S:
++      masm.narrowInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16NarrowI16x8U:
++      masm.unsignedNarrowInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8NarrowI32x4S:
++      masm.narrowInt32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8NarrowI32x4U:
++      masm.unsignedNarrowInt32x4(lhs, rhs, dest);
++      break;
++    // i64 multiply
++    case wasm::SimdOp::I64x2Mul: {
++      FloatRegister temp0 = ToTempFloatRegisterOrInvalid(ins->temp0());
++      FloatRegister temp1f = ToTempFloatRegisterOrInvalid(ins->temp1());
++      masm.mulInt64x2(lhs, rhs, dest, temp0, temp1f);
++      break;
++    }
++    // Extended multiply
++    case wasm::SimdOp::I16x8ExtmulLowI8x16S:
++      masm.extMulLowInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8ExtmulHighI8x16S:
++      masm.extMulHighInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8ExtmulLowI8x16U:
++      masm.unsignedExtMulLowInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8ExtmulHighI8x16U:
++      masm.unsignedExtMulHighInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4ExtmulLowI16x8S:
++      masm.extMulLowInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4ExtmulHighI16x8S:
++      masm.extMulHighInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4ExtmulLowI16x8U:
++      masm.unsignedExtMulLowInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4ExtmulHighI16x8U:
++      masm.unsignedExtMulHighInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2ExtmulLowI32x4S:
++      masm.extMulLowInt32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2ExtmulHighI32x4S:
++      masm.extMulHighInt32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2ExtmulLowI32x4U:
++      masm.unsignedExtMulLowInt32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2ExtmulHighI32x4U:
++      masm.unsignedExtMulHighInt32x4(lhs, rhs, dest);
++      break;
++    // Dot product
++    case wasm::SimdOp::I32x4DotI16x8S:
++      masm.widenDotInt16x8(lhs, rhs, dest);
++      break;
++    // Relaxed binary ops
++    case wasm::SimdOp::F32x4RelaxedMin:
++      masm.minFloat32x4Relaxed(rhs, lhs);
++      if (dest != lhs) masm.moveSimd128(lhs, dest);
++      break;
++    case wasm::SimdOp::F32x4RelaxedMax:
++      masm.maxFloat32x4Relaxed(rhs, lhs);
++      if (dest != lhs) masm.moveSimd128(lhs, dest);
++      break;
++    case wasm::SimdOp::F64x2RelaxedMin:
++      masm.minFloat64x2Relaxed(rhs, lhs);
++      if (dest != lhs) masm.moveSimd128(lhs, dest);
++      break;
++    case wasm::SimdOp::F64x2RelaxedMax:
++      masm.maxFloat64x2Relaxed(rhs, lhs);
++      if (dest != lhs) masm.moveSimd128(lhs, dest);
++      break;
++    case wasm::SimdOp::I8x16RelaxedSwizzle:
++      masm.swizzleInt8x16Relaxed(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8RelaxedQ15MulrS:
++      masm.q15MulrInt16x8Relaxed(lhs, rhs, dest);
++      break;
++    // Swizzle
++    case wasm::SimdOp::I8x16Swizzle:
++      masm.swizzleInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8RelaxedDotI8x16I7x16S:
++      masm.dotInt8x16Int7x16(lhs, rhs, dest);
++      break;
++    default:
++      MOZ_CRASH("PPC64: NYI SIMD binary op");
++  }
++}
++void CodeGenerator::visitWasmBinarySimd128WithConstant(
++    LWasmBinarySimd128WithConstant* ins) {
++  FloatRegister lhs = ToFloatRegister(ins->lhs());
++  FloatRegister dest = ToFloatRegister(ins->output());
++  SimdConstant rhs = ins->rhs();
++  // Load the constant into scratch, then use the binary op.
++  ScratchSimd128Scope scratch(masm);
++  masm.loadConstantSimd128(rhs, scratch);
++  switch (ins->mir()->simdOp()) {
++    // Bitwise
++    case wasm::SimdOp::V128And:
++      masm.bitwiseAndSimd128(lhs, scratch, dest);
++      break;
++    case wasm::SimdOp::V128Or:
++      masm.bitwiseOrSimd128(lhs, scratch, dest);
++      break;
++    case wasm::SimdOp::V128Xor:
++      masm.bitwiseXorSimd128(lhs, scratch, dest);
++      break;
++    case wasm::SimdOp::V128AndNot:
++      masm.bitwiseAndNotSimd128(lhs, scratch, dest);
++      break;
++    // Integer add
++    case wasm::SimdOp::I8x16Add:
++      masm.addInt8x16(lhs, scratch, dest);
++      break;
++    case wasm::SimdOp::I16x8Add:
++      masm.addInt16x8(lhs, scratch, dest);
++      break;
++    case wasm::SimdOp::I32x4Add:
++      masm.addInt32x4(lhs, scratch, dest);
++      break;
++    case wasm::SimdOp::I64x2Add:
++      masm.addInt64x2(lhs, scratch, dest);
++      break;
++    // Integer sub
++    case wasm::SimdOp::I8x16Sub:
++      masm.subInt8x16(lhs, scratch, dest);
++      break;
++    case wasm::SimdOp::I16x8Sub:
++      masm.subInt16x8(lhs, scratch, dest);
++      break;
++    case wasm::SimdOp::I32x4Sub:
++      masm.subInt32x4(lhs, scratch, dest);
++      break;
++    case wasm::SimdOp::I64x2Sub:
++      masm.subInt64x2(lhs, scratch, dest);
++      break;
++    // Integer multiply (16-/32-bit lanes; I64x2 unreachable, see below)
++    case wasm::SimdOp::I16x8Mul:
++      masm.mulInt16x8(lhs, scratch, dest);
++      break;
++    case wasm::SimdOp::I32x4Mul:
++      masm.mulInt32x4(lhs, scratch, dest);
++      break;
++    case wasm::SimdOp::I64x2Mul:
++      // Unreachable on PPC64: MWasmBinarySimd128::specializeForConstantRhs
++      // returns false in Lowering-ppc64.cpp, so MIR with a constant rhs
++      // to I64x2Mul is never created on this backend.
++      //
++      // The previous in-place implementation was broken in three ways:
++      // hard-coded VR0/VR1 staging assumed an ordering that didn't match
++      // the surrounding code; a dead `mfvsrd(a, f0)` clobbered `a`
++      // immediately before the next mfvsrd; and the trailing
++      // `xxpermdi(dest, scratch, dest, 0)` with DM=0 placed lane-0 in the
++      // wrong half. Rather than ship dead-but-broken code, crash loudly
++      // if reachability ever changes — the future enabler must write a
++      // correct lowering (e.g. via masm.mulInt64x2 with explicit temps).
++      MOZ_CRASH("PPC64: I64x2Mul with constant rhs unimplemented "
++                "(specializeForConstantRhs returns false)");
++    // Compare
++    case wasm::SimdOp::I8x16Eq:
++      masm.compareInt8x16(Assembler::Equal, lhs, scratch, dest);
++      break;
++    case wasm::SimdOp::I8x16Ne:
++      masm.compareInt8x16(Assembler::NotEqual, lhs, scratch, dest);
++      break;
++    default:
++      MOZ_CRASH("PPC64: NYI SIMD binary-with-constant op");
++  }
++}
++void CodeGenerator::visitWasmVariableShiftSimd128(
++    LWasmVariableShiftSimd128* ins) {
++  FloatRegister lhs = ToFloatRegister(ins->lhs());
++  Register rhs = ToRegister(ins->rhs());
++  FloatRegister dest = ToFloatRegister(ins->output());
++  switch (ins->mir()->simdOp()) {
++    case wasm::SimdOp::I8x16Shl:
++      masm.leftShiftInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16ShrS:
++      masm.rightShiftInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I8x16ShrU:
++      masm.unsignedRightShiftInt8x16(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8Shl:
++      masm.leftShiftInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8ShrS:
++      masm.rightShiftInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I16x8ShrU:
++      masm.unsignedRightShiftInt16x8(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4Shl:
++      masm.leftShiftInt32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4ShrS:
++      masm.rightShiftInt32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I32x4ShrU:
++      masm.unsignedRightShiftInt32x4(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2Shl:
++      masm.leftShiftInt64x2(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2ShrS:
++      masm.rightShiftInt64x2(lhs, rhs, dest);
++      break;
++    case wasm::SimdOp::I64x2ShrU:
++      masm.unsignedRightShiftInt64x2(lhs, rhs, dest);
++      break;
++    default:
++      MOZ_CRASH("PPC64: NYI SIMD variable shift op");
++  }
++}
++void CodeGenerator::visitWasmConstantShiftSimd128(
++    LWasmConstantShiftSimd128* ins) {
++  FloatRegister src = ToFloatRegister(ins->src());
++  FloatRegister dest = ToFloatRegister(ins->output());
++  int32_t shift = ins->shift();
++  switch (ins->mir()->simdOp()) {
++    case wasm::SimdOp::I8x16Shl:
++      masm.leftShiftInt8x16(Imm32(shift), src, dest);
++      break;
++    case wasm::SimdOp::I8x16ShrS:
++      masm.rightShiftInt8x16(Imm32(shift), src, dest);
++      break;
++    case wasm::SimdOp::I8x16ShrU:
++      masm.unsignedRightShiftInt8x16(Imm32(shift), src, dest);
++      break;
++    case wasm::SimdOp::I16x8Shl:
++      masm.leftShiftInt16x8(Imm32(shift), src, dest);
++      break;
++    case wasm::SimdOp::I16x8ShrS:
++      masm.rightShiftInt16x8(Imm32(shift), src, dest);
++      break;
++    case wasm::SimdOp::I16x8ShrU:
++      masm.unsignedRightShiftInt16x8(Imm32(shift), src, dest);
++      break;
++    case wasm::SimdOp::I32x4Shl:
++      masm.leftShiftInt32x4(Imm32(shift), src, dest);
++      break;
++    case wasm::SimdOp::I32x4ShrS:
++      masm.rightShiftInt32x4(Imm32(shift), src, dest);
++      break;
++    case wasm::SimdOp::I32x4ShrU:
++      masm.unsignedRightShiftInt32x4(Imm32(shift), src, dest);
++      break;
++    case wasm::SimdOp::I64x2Shl:
++      masm.leftShiftInt64x2(Imm32(shift), src, dest);
++      break;
++    case wasm::SimdOp::I64x2ShrS:
++      masm.rightShiftInt64x2(Imm32(shift), src, dest);
++      break;
++    case wasm::SimdOp::I64x2ShrU:
++      masm.unsignedRightShiftInt64x2(Imm32(shift), src, dest);
++      break;
++    default:
++      MOZ_CRASH("PPC64: NYI SIMD constant shift op");
++  }
++}
++void CodeGenerator::visitWasmSignReplicationSimd128(
++    LWasmSignReplicationSimd128* ins) {
++  FloatRegister src = ToFloatRegister(ins->src());
++  FloatRegister dest = ToFloatRegister(ins->output());
++  // Sign replication = arithmetic right shift by max amount (all sign bits).
++  switch (ins->mir()->simdOp()) {
++    case wasm::SimdOp::I8x16ShrS:
++      masm.rightShiftInt8x16(Imm32(7), src, dest);
++      break;
++    case wasm::SimdOp::I16x8ShrS:
++      masm.rightShiftInt16x8(Imm32(15), src, dest);
++      break;
++    case wasm::SimdOp::I32x4ShrS:
++      masm.rightShiftInt32x4(Imm32(31), src, dest);
++      break;
++    case wasm::SimdOp::I64x2ShrS:
++      masm.rightShiftInt64x2(Imm32(63), src, dest);
++      break;
++    default:
++      MOZ_CRASH("Unexpected sign replication op");
++  }
++}
++void CodeGenerator::visitWasmShuffleSimd128(LWasmShuffleSimd128* ins) {
++  FloatRegister lhs = ToFloatRegister(ins->lhs());
++  FloatRegister rhs = ToFloatRegister(ins->rhs());
++  FloatRegister dest = ToFloatRegister(ins->output());
++  SimdConstant ctrl = ins->control();
++  const uint8_t* lanes = reinterpret_cast<const uint8_t*>(ctrl.bytes());
++  masm.shuffleInt8x16(lanes, lhs, rhs, dest);
++}
++void CodeGenerator::visitWasmPermuteSimd128(LWasmPermuteSimd128* ins) {
++  FloatRegister src = ToFloatRegister(ins->src());
++  FloatRegister dest = ToFloatRegister(ins->output());
++  // PPC64: the shuffle analysis transforms control bytes into specialized
++  // formats. Reconstruct raw Wasm byte indices for our vperm implementation.
++  SimdConstant ctrl = ins->control();
++  uint8_t rawLanes[16];
++  switch (ins->op()) {
++    case SimdPermuteOp::MOVE:
++      masm.moveSimd128(src, dest);
++      return;
++    case SimdPermuteOp::PERMUTE_32x4: {
++      const int32_t* words = reinterpret_cast<const int32_t*>(ctrl.bytes());
++      for (int i = 0; i < 4; i++)
++        for (int j = 0; j < 4; j++)
++          rawLanes[i * 4 + j] = words[i] * 4 + j;
++      break;
++    }
++    case SimdPermuteOp::PERMUTE_16x8: {
++      // control has int16 halfword indices. High byte of halfs[0] may have
++      // platform-specific flags (Perm16x8Action). Mask to get the index only.
++      const int16_t* halfs = reinterpret_cast<const int16_t*>(ctrl.bytes());
++      for (int i = 0; i < 8; i++) {
++        int hwIdx = halfs[i] & 0x7;
++        rawLanes[i * 2] = hwIdx * 2;
++        rawLanes[i * 2 + 1] = hwIdx * 2 + 1;
++      }
++      break;
++    }
++    case SimdPermuteOp::BROADCAST_8x16: {
++      uint8_t lane = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
++      for (int i = 0; i < 16; i++) rawLanes[i] = lane;
++      break;
++    }
++    case SimdPermuteOp::BROADCAST_16x8: {
++      uint8_t lane = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
++      for (int i = 0; i < 8; i++) {
++        rawLanes[i * 2] = lane * 2;
++        rawLanes[i * 2 + 1] = lane * 2 + 1;
++      }
++      break;
++    }
++    case SimdPermuteOp::ROTATE_RIGHT_8x16: {
++      uint8_t shift = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
++      for (int i = 0; i < 16; i++) rawLanes[i] = (i + shift) % 16;
++      break;
++    }
++    case SimdPermuteOp::SHIFT_LEFT_8x16: {
++      // Shifted-out positions must be zero. Use index 16+ to pick from zero.
++      uint8_t shift = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
++      for (int i = 0; i < 16; i++)
++        rawLanes[i] = (i >= shift) ? (i - shift) : (16 + i);
++      goto needsZeroRhs;
++    }
++    case SimdPermuteOp::SHIFT_RIGHT_8x16: {
++      uint8_t shift = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
++      for (int i = 0; i < 16; i++)
++        rawLanes[i] = (i + shift < 16) ? (i + shift) : (16 + i);
++      goto needsZeroRhs;
++    }
++    case SimdPermuteOp::REVERSE_16x8: {
++      // Reverse bytes within each 16-bit lane: [1,0,3,2,5,4,...]
++      for (int i = 0; i < 8; i++) {
++        rawLanes[i * 2] = i * 2 + 1;
++        rawLanes[i * 2 + 1] = i * 2;
++      }
++      break;
++    }
++    case SimdPermuteOp::REVERSE_32x4: {
++      // Reverse bytes within each 32-bit lane: [3,2,1,0,7,6,5,4,...]
++      for (int i = 0; i < 4; i++)
++        for (int j = 0; j < 4; j++)
++          rawLanes[i * 4 + j] = i * 4 + (3 - j);
++      break;
++    }
++    case SimdPermuteOp::REVERSE_64x2: {
++      // Reverse bytes within each 64-bit lane: [7,6,5,4,3,2,1,0,15,...]
++      for (int i = 0; i < 2; i++)
++        for (int j = 0; j < 8; j++)
++          rawLanes[i * 8 + j] = i * 8 + (7 - j);
++      break;
++    }
++    case SimdPermuteOp::ZERO_EXTEND_8x16_TO_16x8:
++    case SimdPermuteOp::ZERO_EXTEND_8x16_TO_32x4:
++    case SimdPermuteOp::ZERO_EXTEND_8x16_TO_64x2:
++    case SimdPermuteOp::ZERO_EXTEND_16x8_TO_32x4:
++    case SimdPermuteOp::ZERO_EXTEND_16x8_TO_64x2:
++    case SimdPermuteOp::ZERO_EXTEND_32x4_TO_64x2: {
++      const int8_t* bytes = reinterpret_cast<const int8_t*>(ctrl.bytes());
++      for (int i = 0; i < 16; i++) rawLanes[i] = bytes[i];
++      goto needsZeroRhs;
++    }
++    default: {
++      // PERMUTE_8x16 and others: control has raw byte indices.
++      const int8_t* bytes = reinterpret_cast<const int8_t*>(ctrl.bytes());
++      for (int i = 0; i < 16; i++) rawLanes[i] = bytes[i];
++      break;
++    }
++  }
++  masm.shuffleInt8x16(rawLanes, src, src, dest);
++  return;
++
++  needsZeroRhs: {
++    // Wasm convention: rawLanes[i] in 0..15 selects src.LE_byte[idx], and
++    // rawLanes[i] >= 16 means "zero". Without spilling, we can't satisfy
++    // vperm's three-input constraint AND keep src alive when dest == src.
++    // Strategy: vperm src with itself (any valid byte for the "zero"
++    // positions, bytes get masked out below), then AND with a mask that
++    // zeros those positions.
++    int8_t ctrl[16], mask[16];
++    for (unsigned i = 0; i < 16; i++) {
++      uint8_t idx = rawLanes[i];
++      if (idx < 16) {
++        ctrl[i] = 15 - idx;
++        mask[i] = -1;
++      } else {
++        ctrl[i] = 0;
++        mask[i] = 0;
++      }
++    }
++    ScratchSimd128Scope scratch(masm);
++    masm.loadConstantSimd128(SimdConstant::CreateX16(ctrl), scratch);
++    masm.as_vperm(dest.encoding() & 31,
++                  src.encoding() & 31,
++                  src.encoding() & 31,
++                  scratch.encoding() & 31);
++    masm.loadConstantSimd128(SimdConstant::CreateX16(mask), scratch);
++    masm.as_xxland(dest, dest, scratch);
++    return;
++  }
++}
++void CodeGenerator::visitWasmReplaceLaneSimd128(LWasmReplaceLaneSimd128* ins) {
++  FloatRegister lhsDest = ToFloatRegister(ins->output());
++  MOZ_ASSERT(ToFloatRegister(ins->lhs()) == lhsDest);
++  uint32_t lane = ins->mir()->laneIndex();
++  switch (ins->mir()->simdOp()) {
++    case wasm::SimdOp::I8x16ReplaceLane:
++      masm.replaceLaneInt8x16(lane, ToRegister(ins->rhs()), lhsDest);
++      break;
++    case wasm::SimdOp::I16x8ReplaceLane:
++      masm.replaceLaneInt16x8(lane, ToRegister(ins->rhs()), lhsDest);
++      break;
++    case wasm::SimdOp::I32x4ReplaceLane:
++      masm.replaceLaneInt32x4(lane, ToRegister(ins->rhs()), lhsDest);
++      break;
++    case wasm::SimdOp::F32x4ReplaceLane:
++      masm.replaceLaneFloat32x4(lane, ToFloatRegister(ins->rhs()), lhsDest);
++      break;
++    case wasm::SimdOp::F64x2ReplaceLane:
++      masm.replaceLaneFloat64x2(lane, ToFloatRegister(ins->rhs()), lhsDest);
++      break;
++    default:
++      MOZ_CRASH("PPC64: NYI SIMD replace lane op");
++  }
++}
++void CodeGenerator::visitWasmReplaceInt64LaneSimd128(
++    LWasmReplaceInt64LaneSimd128* ins) {
++  MOZ_ASSERT(ins->mir()->simdOp() == wasm::SimdOp::I64x2ReplaceLane);
++  FloatRegister lhsDest = ToFloatRegister(ins->output());
++  MOZ_ASSERT(ToFloatRegister(ins->lhs()) == lhsDest);
++  masm.replaceLaneInt64x2(ins->mir()->laneIndex(),
++                          ToRegister64(ins->rhs()), lhsDest);
++}
++void CodeGenerator::visitWasmScalarToSimd128(LWasmScalarToSimd128* ins) {
++  FloatRegister dest = ToFloatRegister(ins->output());
++  switch (ins->mir()->simdOp()) {
++    case wasm::SimdOp::I8x16Splat:
++      masm.splatX16(ToRegister(ins->src()), dest);
++      break;
++    case wasm::SimdOp::I16x8Splat:
++      masm.splatX8(ToRegister(ins->src()), dest);
++      break;
++    case wasm::SimdOp::I32x4Splat:
++      masm.splatX4(ToRegister(ins->src()), dest);
++      break;
++    case wasm::SimdOp::F32x4Splat:
++      masm.splatX4(ToFloatRegister(ins->src()), dest);
++      break;
++    case wasm::SimdOp::F64x2Splat:
++      masm.splatX2(ToFloatRegister(ins->src()), dest);
++      break;
++    default:
++      MOZ_CRASH("PPC64: NYI SIMD scalar-to-simd op");
++  }
++}
++void CodeGenerator::visitWasmInt64ToSimd128(LWasmInt64ToSimd128* ins) {
++  FloatRegister dest = ToFloatRegister(ins->output());
++  switch (ins->mir()->simdOp()) {
++    case wasm::SimdOp::I64x2Splat:
++      masm.splatX2(ToRegister64(ins->src()), dest);
++      break;
++    default:
++      MOZ_CRASH("PPC64: NYI SIMD int64-to-simd op");
++  }
++}
++void CodeGenerator::visitWasmUnarySimd128(LWasmUnarySimd128* ins) {
++  FloatRegister src = ToFloatRegister(ins->src());
++  FloatRegister dest = ToFloatRegister(ins->output());
++  switch (ins->mir()->simdOp()) {
++    case wasm::SimdOp::I8x16Neg:
++      masm.negInt8x16(src, dest);
++      break;
++    case wasm::SimdOp::I16x8Neg:
++      masm.negInt16x8(src, dest);
++      break;
++    case wasm::SimdOp::I32x4Neg:
++      masm.negInt32x4(src, dest);
++      break;
++    case wasm::SimdOp::I64x2Neg:
++      masm.negInt64x2(src, dest);
++      break;
++    case wasm::SimdOp::I8x16Abs:
++      masm.absInt8x16(src, dest);
++      break;
++    case wasm::SimdOp::I16x8Abs:
++      masm.absInt16x8(src, dest);
++      break;
++    case wasm::SimdOp::I32x4Abs:
++      masm.absInt32x4(src, dest);
++      break;
++    case wasm::SimdOp::I64x2Abs:
++      masm.absInt64x2(src, dest);
++      break;
++    case wasm::SimdOp::V128Not:
++      masm.bitwiseNotSimd128(src, dest);
++      break;
++    case wasm::SimdOp::F32x4Neg:
++      masm.negFloat32x4(src, dest);
++      break;
++    case wasm::SimdOp::F64x2Neg:
++      masm.negFloat64x2(src, dest);
++      break;
++    case wasm::SimdOp::F32x4Abs:
++      masm.absFloat32x4(src, dest);
++      break;
++    case wasm::SimdOp::F64x2Abs:
++      masm.absFloat64x2(src, dest);
++      break;
++    case wasm::SimdOp::F32x4Sqrt:
++      masm.sqrtFloat32x4(src, dest);
++      break;
++    case wasm::SimdOp::F64x2Sqrt:
++      masm.sqrtFloat64x2(src, dest);
++      break;
++    case wasm::SimdOp::F32x4Ceil:
++      masm.ceilFloat32x4(src, dest);
++      break;
++    case wasm::SimdOp::F64x2Ceil:
++      masm.ceilFloat64x2(src, dest);
++      break;
++    case wasm::SimdOp::F32x4Floor:
++      masm.floorFloat32x4(src, dest);
++      break;
++    case wasm::SimdOp::F64x2Floor:
++      masm.floorFloat64x2(src, dest);
++      break;
++    case wasm::SimdOp::F32x4Trunc:
++      masm.truncFloat32x4(src, dest);
++      break;
++    case wasm::SimdOp::F64x2Trunc:
++      masm.truncFloat64x2(src, dest);
++      break;
++    case wasm::SimdOp::F32x4Nearest:
++      masm.nearestFloat32x4(src, dest);
++      break;
++    case wasm::SimdOp::F64x2Nearest:
++      masm.nearestFloat64x2(src, dest);
++      break;
++    // Conversions
++    case wasm::SimdOp::F32x4ConvertI32x4S:
++      masm.convertInt32x4ToFloat32x4(src, dest);
++      break;
++    case wasm::SimdOp::F32x4ConvertI32x4U:
++      masm.unsignedConvertInt32x4ToFloat32x4(src, dest);
++      break;
++    case wasm::SimdOp::I32x4TruncSatF32x4S:
++      masm.truncSatFloat32x4ToInt32x4(src, dest);
++      break;
++    case wasm::SimdOp::I32x4TruncSatF32x4U:
++      masm.unsignedTruncSatFloat32x4ToInt32x4(src, dest);
++      break;
++    case wasm::SimdOp::F64x2ConvertLowI32x4S:
++      masm.convertInt32x4ToFloat64x2(src, dest);
++      break;
++    case wasm::SimdOp::F64x2ConvertLowI32x4U:
++      masm.unsignedConvertInt32x4ToFloat64x2(src, dest);
++      break;
++    case wasm::SimdOp::F32x4DemoteF64x2Zero:
++      masm.convertFloat64x2ToFloat32x4(src, dest);
++      break;
++    case wasm::SimdOp::F64x2PromoteLowF32x4:
++      masm.convertFloat32x4ToFloat64x2(src, dest);
++      break;
++    case wasm::SimdOp::I32x4TruncSatF64x2SZero:
++      masm.truncSatFloat64x2ToInt32x4(src, dest, ScratchSimd128Reg);
++      break;
++    case wasm::SimdOp::I32x4TruncSatF64x2UZero:
++      masm.unsignedTruncSatFloat64x2ToInt32x4(src, dest, ScratchSimd128Reg);
++      break;
++    // Widen
++    case wasm::SimdOp::I16x8ExtendLowI8x16S:
++      masm.widenLowInt8x16(src, dest);
++      break;
++    case wasm::SimdOp::I16x8ExtendHighI8x16S:
++      masm.widenHighInt8x16(src, dest);
++      break;
++    case wasm::SimdOp::I16x8ExtendLowI8x16U:
++      masm.unsignedWidenLowInt8x16(src, dest);
++      break;
++    case wasm::SimdOp::I16x8ExtendHighI8x16U:
++      masm.unsignedWidenHighInt8x16(src, dest);
++      break;
++    case wasm::SimdOp::I32x4ExtendLowI16x8S:
++      masm.widenLowInt16x8(src, dest);
++      break;
++    case wasm::SimdOp::I32x4ExtendHighI16x8S:
++      masm.widenHighInt16x8(src, dest);
++      break;
++    case wasm::SimdOp::I32x4ExtendLowI16x8U:
++      masm.unsignedWidenLowInt16x8(src, dest);
++      break;
++    case wasm::SimdOp::I32x4ExtendHighI16x8U:
++      masm.unsignedWidenHighInt16x8(src, dest);
++      break;
++    case wasm::SimdOp::I64x2ExtendLowI32x4S:
++      masm.widenLowInt32x4(src, dest);
++      break;
++    case wasm::SimdOp::I64x2ExtendHighI32x4S:
++      masm.widenHighInt32x4(src, dest);
++      break;
++    case wasm::SimdOp::I64x2ExtendLowI32x4U:
++      masm.unsignedWidenLowInt32x4(src, dest);
++      break;
++    case wasm::SimdOp::I64x2ExtendHighI32x4U:
++      masm.unsignedWidenHighInt32x4(src, dest);
++      break;
++    // Extended add pairwise
++    case wasm::SimdOp::I16x8ExtaddPairwiseI8x16S:
++      masm.extAddPairwiseInt8x16(src, dest);
++      break;
++    case wasm::SimdOp::I16x8ExtaddPairwiseI8x16U:
++      masm.unsignedExtAddPairwiseInt8x16(src, dest);
++      break;
++    case wasm::SimdOp::I32x4ExtaddPairwiseI16x8S:
++      masm.extAddPairwiseInt16x8(src, dest);
++      break;
++    case wasm::SimdOp::I32x4ExtaddPairwiseI16x8U:
++      masm.unsignedExtAddPairwiseInt16x8(src, dest);
++      break;
++    // Relaxed truncation
++    case wasm::SimdOp::I32x4RelaxedTruncF32x4S:
++      masm.truncFloat32x4ToInt32x4Relaxed(src, dest);
++      break;
++    case wasm::SimdOp::I32x4RelaxedTruncF32x4U:
++      masm.unsignedTruncFloat32x4ToInt32x4Relaxed(src, dest);
++      break;
++    case wasm::SimdOp::I32x4RelaxedTruncF64x2SZero:
++      masm.truncFloat64x2ToInt32x4Relaxed(src, dest);
++      break;
++    case wasm::SimdOp::I32x4RelaxedTruncF64x2UZero:
++      masm.unsignedTruncFloat64x2ToInt32x4Relaxed(src, dest);
++      break;
++    // Popcnt
++    case wasm::SimdOp::I8x16Popcnt:
++      masm.popcntInt8x16(src, dest);
++      break;
++    default:
++      MOZ_CRASH("PPC64: NYI SIMD unary op");
++  }
++}
++void CodeGenerator::visitWasmReduceSimd128(LWasmReduceSimd128* ins) {
++  FloatRegister src = ToFloatRegister(ins->src());
++  uint32_t imm = ins->mir()->imm();
++  switch (ins->mir()->simdOp()) {
++    case wasm::SimdOp::I8x16ExtractLaneS:
++      masm.extractLaneInt8x16(imm, src, ToRegister(ins->output()));
++      break;
++    case wasm::SimdOp::I8x16ExtractLaneU:
++      masm.unsignedExtractLaneInt8x16(imm, src, ToRegister(ins->output()));
++      break;
++    case wasm::SimdOp::I16x8ExtractLaneS:
++      masm.extractLaneInt16x8(imm, src, ToRegister(ins->output()));
++      break;
++    case wasm::SimdOp::I16x8ExtractLaneU:
++      masm.unsignedExtractLaneInt16x8(imm, src, ToRegister(ins->output()));
++      break;
++    case wasm::SimdOp::I32x4ExtractLane:
++      masm.extractLaneInt32x4(imm, src, ToRegister(ins->output()));
++      break;
++    case wasm::SimdOp::F32x4ExtractLane:
++      masm.extractLaneFloat32x4(imm, src, ToFloatRegister(ins->output()));
++      break;
++    case wasm::SimdOp::F64x2ExtractLane:
++      masm.extractLaneFloat64x2(imm, src, ToFloatRegister(ins->output()));
++      break;
++    case wasm::SimdOp::V128AnyTrue:
++      masm.anyTrueSimd128(src, ToRegister(ins->output()));
++      break;
++    case wasm::SimdOp::I8x16AllTrue:
++      masm.allTrueInt8x16(src, ToRegister(ins->output()));
++      break;
++    case wasm::SimdOp::I16x8AllTrue:
++      masm.allTrueInt16x8(src, ToRegister(ins->output()));
++      break;
++    case wasm::SimdOp::I32x4AllTrue:
++      masm.allTrueInt32x4(src, ToRegister(ins->output()));
++      break;
++    case wasm::SimdOp::I64x2AllTrue:
++      masm.allTrueInt64x2(src, ToRegister(ins->output()));
++      break;
++    case wasm::SimdOp::I8x16Bitmask:
++      masm.bitmaskInt8x16(src, ToRegister(ins->output()), ScratchSimd128Reg);
++      break;
++    case wasm::SimdOp::I16x8Bitmask:
++      masm.bitmaskInt16x8(src, ToRegister(ins->output()), ScratchSimd128Reg);
++      break;
++    case wasm::SimdOp::I32x4Bitmask:
++      masm.bitmaskInt32x4(src, ToRegister(ins->output()), ScratchSimd128Reg);
++      break;
++    case wasm::SimdOp::I64x2Bitmask:
++      masm.bitmaskInt64x2(src, ToRegister(ins->output()), ScratchSimd128Reg);
++      break;
++    default:
++      MOZ_CRASH("PPC64: NYI SIMD reduce op");
++  }
++}
++void CodeGenerator::visitWasmReduceAndBranchSimd128(
++    LWasmReduceAndBranchSimd128* ins) {
++  FloatRegister src = ToFloatRegister(ins->src());
++  UseScratchRegisterScope temps(masm);
++  Register tmp = temps.Acquire();
++  switch (ins->simdOp()) {
++    case wasm::SimdOp::V128AnyTrue:
++      masm.anyTrueSimd128(src, tmp);
++      break;
++    case wasm::SimdOp::I8x16AllTrue:
++      masm.allTrueInt8x16(src, tmp);
++      break;
++    case wasm::SimdOp::I16x8AllTrue:
++      masm.allTrueInt16x8(src, tmp);
++      break;
++    case wasm::SimdOp::I32x4AllTrue:
++      masm.allTrueInt32x4(src, tmp);
++      break;
++    case wasm::SimdOp::I64x2AllTrue:
++      masm.allTrueInt64x2(src, tmp);
++      break;
++    case wasm::SimdOp::I8x16Bitmask:
++      masm.bitmaskInt8x16(src, tmp, ScratchSimd128Reg);
++      break;
++    case wasm::SimdOp::I16x8Bitmask:
++      masm.bitmaskInt16x8(src, tmp, ScratchSimd128Reg);
++      break;
++    case wasm::SimdOp::I32x4Bitmask:
++      masm.bitmaskInt32x4(src, tmp, ScratchSimd128Reg);
++      break;
++    case wasm::SimdOp::I64x2Bitmask:
++      masm.bitmaskInt64x2(src, tmp, ScratchSimd128Reg);
++      break;
++    default:
++      MOZ_CRASH("PPC64: NYI SIMD reduce-and-branch op");
++  }
++  masm.as_cmpdi(tmp, 0);
++  // Branch to ifTrue if nonzero, fall through to ifFalse.
++  Label* ifTrue = skipTrivialBlocks(ins->ifTrue())->lir()->label();
++  Label* ifFalse = skipTrivialBlocks(ins->ifFalse())->lir()->label();
++  masm.ma_b(Assembler::NotEqual, ifTrue);
++  masm.jump(ifFalse);
++}
++void CodeGenerator::visitWasmReduceSimd128ToInt64(
++    LWasmReduceSimd128ToInt64* ins) {
++  FloatRegister src = ToFloatRegister(ins->src());
++  Register64 dest = ToOutRegister64(ins);
++  switch (ins->mir()->simdOp()) {
++    case wasm::SimdOp::I64x2ExtractLane:
++      masm.extractLaneInt64x2(ins->mir()->imm(), src, dest);
++      break;
++    default:
++      MOZ_CRASH("PPC64: NYI SIMD reduce-to-int64 op");
++  }
++}
++static inline wasm::MemoryAccessDesc DeriveMemoryAccessDesc(
++    const wasm::MemoryAccessDesc& access, Scalar::Type type) {
++  return wasm::MemoryAccessDesc(access.memoryIndex(), type, access.align(),
++                                access.offset32(), access.trapDesc(),
++                                access.isHugeMemory());
++}
++
++void CodeGenerator::visitWasmLoadLaneSimd128(LWasmLoadLaneSimd128* ins) {
++  const MWasmLoadLaneSimd128* mir = ins->mir();
++  Register memoryBase = ToRegister(ins->memoryBase());
++  Register ptr = ToRegister(ins->ptr());
++  FloatRegister src = ToFloatRegister(ins->src());
++  FloatRegister dest = ToFloatRegister(ins->output());
++  UseScratchRegisterScope temps(masm);
++  Register tmp = temps.Acquire();
++  masm.moveSimd128(src, dest);
++  switch (mir->laneSize()) {
++    case 1:
++      masm.wasmLoad(DeriveMemoryAccessDesc(mir->access(), Scalar::Int8),
++                    memoryBase, ptr, ptr, AnyRegister(tmp));
++      masm.replaceLaneInt8x16(mir->laneIndex(), tmp, dest);
++      break;
++    case 2:
++      masm.wasmLoad(DeriveMemoryAccessDesc(mir->access(), Scalar::Int16),
++                    memoryBase, ptr, ptr, AnyRegister(tmp));
++      masm.replaceLaneInt16x8(mir->laneIndex(), tmp, dest);
++      break;
++    case 4:
++      masm.wasmLoad(DeriveMemoryAccessDesc(mir->access(), Scalar::Int32),
++                    memoryBase, ptr, ptr, AnyRegister(tmp));
++      masm.replaceLaneInt32x4(mir->laneIndex(), tmp, dest);
++      break;
++    case 8: {
++      masm.wasmLoadI64(DeriveMemoryAccessDesc(mir->access(), Scalar::Int64),
++                       memoryBase, ptr, ptr,
++                       Register64(tmp));
++      masm.replaceLaneInt64x2(mir->laneIndex(), Register64(tmp), dest);
++      break;
++    }
++    default:
++      MOZ_CRASH("Unexpected lane size");
++  }
++}
++void CodeGenerator::visitWasmStoreLaneSimd128(LWasmStoreLaneSimd128* ins) {
++  const MWasmStoreLaneSimd128* mir = ins->mir();
++  Register memoryBase = ToRegister(ins->memoryBase());
++  Register ptr = ToRegister(ins->ptr());
++  FloatRegister src = ToFloatRegister(ins->src());
++  UseScratchRegisterScope temps(masm);
++  Register tmp = temps.Acquire();
++  switch (mir->laneSize()) {
++    case 1:
++      masm.unsignedExtractLaneInt8x16(mir->laneIndex(), src, tmp);
++      masm.wasmStore(DeriveMemoryAccessDesc(mir->access(), Scalar::Int8),
++                     AnyRegister(tmp), memoryBase, ptr, ptr);
++      break;
++    case 2:
++      masm.unsignedExtractLaneInt16x8(mir->laneIndex(), src, tmp);
++      masm.wasmStore(DeriveMemoryAccessDesc(mir->access(), Scalar::Int16),
++                     AnyRegister(tmp), memoryBase, ptr, ptr);
++      break;
++    case 4:
++      masm.extractLaneInt32x4(mir->laneIndex(), src, tmp);
++      masm.wasmStore(DeriveMemoryAccessDesc(mir->access(), Scalar::Int32),
++                     AnyRegister(tmp), memoryBase, ptr, ptr);
++      break;
++    case 8:
++      masm.extractLaneInt64x2(mir->laneIndex(), src, Register64(tmp));
++      masm.wasmStoreI64(DeriveMemoryAccessDesc(mir->access(), Scalar::Int64),
++                        Register64(tmp), memoryBase, ptr, ptr);
++      break;
++    default:
++      MOZ_CRASH("Unexpected lane size");
++  }
++}
++
++}  // namespace jit
++}  // namespace js
+diff --git a/js/src/jit/ppc64/CodeGenerator-ppc64.h b/js/src/jit/ppc64/CodeGenerator-ppc64.h
+new file mode 100644
+index 000000000000..3414eceb5ac4
+--- /dev/null
++++ b/js/src/jit/ppc64/CodeGenerator-ppc64.h
+@@ -0,0 +1,101 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_CodeGenerator_ppc64_h
++#define jit_ppc64_CodeGenerator_ppc64_h
++
++#include "jit/ppc64/Assembler-ppc64.h"
++#include "jit/shared/CodeGenerator-shared.h"
++
++namespace js {
++namespace jit {
++
++class CodeGeneratorPPC64;
++class OutOfLineTableSwitch;
++
++using OutOfLineWasmTruncateCheck =
++    OutOfLineWasmTruncateCheckBase<CodeGeneratorPPC64>;
++
++class CodeGeneratorPPC64 : public CodeGeneratorShared {
++  friend class MoveResolverPPC64;
++
++ protected:
++  CodeGeneratorPPC64(MIRGenerator* gen, LIRGraph* graph, MacroAssembler* masm,
++                     const wasm::CodeMetadata* codeMeta);
++
++  NonAssertingLabel deoptLabel_;
++
++  Operand ToOperand(const LAllocation& a);
++  Operand ToOperand(const LAllocation* a);
++  MoveOperand toMoveOperand(LAllocation a) const;
++
++  template <typename T1, typename T2>
++  void bailoutCmp32(Assembler::Condition c, T1 lhs, T2 rhs,
++                    LSnapshot* snapshot) {
++    Label bail;
++    masm.branch32(c, lhs, rhs, &bail);
++    bailoutFrom(&bail, snapshot);
++  }
++  template <typename T1, typename T2>
++  void bailoutCmpPtr(Assembler::Condition c, T1 lhs, T2 rhs,
++                     LSnapshot* snapshot) {
++    Label bail;
++    masm.branchPtr(c, lhs, rhs, &bail);
++    bailoutFrom(&bail, snapshot);
++  }
++  template <typename T1, typename T2>
++  void bailoutTest32(Assembler::Condition c, T1 lhs, T2 rhs,
++                     LSnapshot* snapshot) {
++    Label bail;
++    masm.branchTest32(c, lhs, rhs, &bail);
++    bailoutFrom(&bail, snapshot);
++  }
++  void bailoutIfFalseBool(Register lhs, LSnapshot* snapshot);
++  void bailoutFrom(Label* label, LSnapshot* snapshot);
++  void bailout(LSnapshot* snapshot);
++
++ protected:
++  bool generateOutOfLineCode();
++  void branchToBlock(MBasicBlock* block);
++
++  template <typename T>
++  void branchToBlock(Assembler::Condition cond, Register lhs, T rhs,
++                     MBasicBlock* mir) {
++    Label* label = skipTrivialBlocks(mir)->lir()->label();
++    masm.branch32(cond, lhs, rhs, label);
++  }
++  void branchToBlock(Assembler::DoubleCondition cond, FloatRegister lhs,
++                     FloatRegister rhs, MBasicBlock* mir);
++  void branchToBlock(Assembler::FloatFormat fmt,
++                     Assembler::DoubleCondition cond, FloatRegister lhs,
++                     FloatRegister rhs, MBasicBlock* mir);
++
++  void emitTableSwitchDispatch(MTableSwitch* mir, Register index,
++                               Register base);
++
++  void emitBigIntPtrDiv(LBigIntPtrDiv* ins, Register dividend, Register divisor,
++                        Register output);
++  void emitBigIntPtrMod(LBigIntPtrMod* ins, Register dividend, Register divisor,
++                        Register output);
++
++  void generateInvalidateEpilogue();
++
++  template <typename T>
++  void emitWasmLoad(T* lir);
++  template <typename T>
++  void emitWasmStore(T* lir);
++
++ public:
++  void visitOutOfLineTableSwitch(OutOfLineTableSwitch* ool);
++  void visitOutOfLineWasmTruncateCheck(OutOfLineWasmTruncateCheck* ool);
++};
++
++typedef CodeGeneratorPPC64 CodeGeneratorSpecific;
++
++}  // namespace jit
++}  // namespace js
++
++#endif /* jit_ppc64_CodeGenerator_ppc64_h */
+diff --git a/js/src/jit/ppc64/LIR-ppc64.h b/js/src/jit/ppc64/LIR-ppc64.h
+new file mode 100644
+index 000000000000..686875056127
+--- /dev/null
++++ b/js/src/jit/ppc64/LIR-ppc64.h
+@@ -0,0 +1,135 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_LIR_ppc64_h
++#define jit_ppc64_LIR_ppc64_h
++
++namespace js {
++namespace jit {
++
++class LUnbox : public LInstructionHelper<1, BOX_PIECES, 0> {
++ public:
++  LIR_HEADER(Unbox);
++
++  explicit LUnbox(const LAllocation& input) : LInstructionHelper(classOpcode) {
++    setOperand(0, input);
++  }
++
++  static const size_t Input = 0;
++
++  LBoxAllocation input() const { return getBoxOperand(Input); }
++
++  MUnbox* mir() const { return mir_->toUnbox(); }
++  const char* extraName() const { return StringFromMIRType(mir()->type()); }
++};
++
++class LUDivOrMod : public LBinaryMath<0> {
++ public:
++  LIR_HEADER(UDivOrMod);
++
++  LUDivOrMod() : LBinaryMath(classOpcode) {}
++
++  MBinaryArithInstruction* mir() const {
++    MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
++    return static_cast<MBinaryArithInstruction*>(mir_);
++  }
++
++  bool canBeDivideByZero() const {
++    if (mir_->isMod()) {
++      return mir_->toMod()->canBeDivideByZero();
++    }
++    return mir_->toDiv()->canBeDivideByZero();
++  }
++
++  bool trapOnError() const {
++    if (mir_->isMod()) {
++      return mir_->toMod()->trapOnError();
++    }
++    return mir_->toDiv()->trapOnError();
++  }
++
++  wasm::TrapSiteDesc trapSiteDesc() const {
++    MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
++    if (mir_->isMod()) {
++      return mir_->toMod()->trapSiteDesc();
++    }
++    return mir_->toDiv()->trapSiteDesc();
++  }
++};
++
++class LDivOrModI64 : public LBinaryMath<0> {
++ public:
++  LIR_HEADER(DivOrModI64);
++
++  LDivOrModI64(const LAllocation& lhs, const LAllocation& rhs)
++      : LBinaryMath(classOpcode) {
++    setOperand(0, lhs);
++    setOperand(1, rhs);
++  }
++
++  MBinaryArithInstruction* mir() const {
++    MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
++    return static_cast<MBinaryArithInstruction*>(mir_);
++  }
++
++  bool canBeDivideByZero() const {
++    if (mir_->isMod()) {
++      return mir_->toMod()->canBeDivideByZero();
++    }
++    return mir_->toDiv()->canBeDivideByZero();
++  }
++  bool canBeNegativeOverflow() const {
++    if (mir_->isMod()) {
++      return mir_->toMod()->canBeNegativeDividend();
++    }
++    return mir_->toDiv()->canBeNegativeOverflow();
++  }
++  wasm::TrapSiteDesc trapSiteDesc() const {
++    MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
++    if (mir_->isMod()) {
++      return mir_->toMod()->trapSiteDesc();
++    }
++    return mir_->toDiv()->trapSiteDesc();
++  }
++};
++
++class LUDivOrModI64 : public LBinaryMath<0> {
++ public:
++  LIR_HEADER(UDivOrModI64);
++
++  LUDivOrModI64(const LAllocation& lhs, const LAllocation& rhs)
++      : LBinaryMath(classOpcode) {
++    setOperand(0, lhs);
++    setOperand(1, rhs);
++  }
++
++  const char* extraName() const {
++    return mir()->isTruncated() ? "Truncated" : nullptr;
++  }
++
++  MBinaryArithInstruction* mir() const {
++    MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
++    return static_cast<MBinaryArithInstruction*>(mir_);
++  }
++  bool canBeDivideByZero() const {
++    if (mir_->isMod()) {
++      return mir_->toMod()->canBeDivideByZero();
++    }
++    return mir_->toDiv()->canBeDivideByZero();
++  }
++  wasm::TrapSiteDesc trapSiteDesc() const {
++    MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
++    if (mir_->isMod()) {
++      return mir_->toMod()->trapSiteDesc();
++    }
++    return mir_->toDiv()->trapSiteDesc();
++  }
++};
++
++}  // namespace jit
++}  // namespace js
++
++#endif /* jit_ppc64_LIR_ppc64_h */
+diff --git a/js/src/jit/ppc64/Lowering-ppc64.cpp b/js/src/jit/ppc64/Lowering-ppc64.cpp
+new file mode 100644
+index 000000000000..be0ead19d273
+--- /dev/null
++++ b/js/src/jit/ppc64/Lowering-ppc64.cpp
+@@ -0,0 +1,1324 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/ppc64/Lowering-ppc64.h"
++
++#include "mozilla/MathAlgorithms.h"
++
++#include "jit/Lowering.h"
++#include "jit/MIR-wasm.h"
++#include "jit/MIR.h"
++#include "jit/ppc64/Assembler-ppc64.h"
++#include "wasm/WasmFeatures.h"  // for wasm::ReportSimdAnalysis
++
++#include "jit/shared/Lowering-shared-inl.h"
++
++using namespace js;
++using namespace js::jit;
++
++using mozilla::FloorLog2;
++
++namespace js {
++namespace jit {
++
++LTableSwitch* LIRGeneratorPPC64::newLTableSwitch(const LAllocation& in,
++                                                 const LDefinition& inputCopy) {
++  return new (alloc()) LTableSwitch(in, inputCopy, temp());
++}
++
++LTableSwitchV* LIRGeneratorPPC64::newLTableSwitchV(const LBoxAllocation& in) {
++  return new (alloc()) LTableSwitchV(in, temp(), tempDouble(), temp());
++}
++
++void LIRGeneratorPPC64::lowerForShift(LInstructionHelper<1, 2, 0>* ins,
++                                      MDefinition* mir, MDefinition* lhs,
++                                      MDefinition* rhs) {
++  lowerForALU(ins, mir, lhs, rhs);
++}
++
++template <class LInstr>
++void LIRGeneratorPPC64::lowerForShiftInt64(LInstr* ins, MDefinition* mir,
++                                           MDefinition* lhs, MDefinition* rhs) {
++  if constexpr (std::is_same_v<LInstr, LShiftI64>) {
++    ins->setLhs(useInt64RegisterAtStart(lhs));
++    ins->setRhs(useRegisterOrConstantAtStart(rhs));
++  } else {
++    ins->setInput(useInt64RegisterAtStart(lhs));
++    ins->setCount(useRegisterOrConstantAtStart(rhs));
++  }
++  defineInt64(ins, mir);
++}
++
++template void LIRGeneratorPPC64::lowerForShiftInt64(LShiftI64* ins,
++                                                    MDefinition* mir,
++                                                    MDefinition* lhs,
++                                                    MDefinition* rhs);
++template void LIRGeneratorPPC64::lowerForShiftInt64(LRotateI64* ins,
++                                                    MDefinition* mir,
++                                                    MDefinition* lhs,
++                                                    MDefinition* rhs);
++
++void LIRGeneratorPPC64::lowerForALU(LInstructionHelper<1, 1, 0>* ins,
++                                    MDefinition* mir, MDefinition* input) {
++  ins->setOperand(0, useRegisterAtStart(input));
++  define(ins, mir);
++}
++
++void LIRGeneratorPPC64::lowerForALU(LInstructionHelper<1, 2, 0>* ins,
++                                    MDefinition* mir, MDefinition* lhs,
++                                    MDefinition* rhs) {
++  ins->setOperand(0, useRegisterAtStart(lhs));
++  ins->setOperand(1, useRegisterOrConstantAtStart(rhs));
++  define(ins, mir);
++}
++
++void LIRGeneratorPPC64::lowerForALUInt64(
++    LInstructionHelper<INT64_PIECES, INT64_PIECES, 0>* ins, MDefinition* mir,
++    MDefinition* input) {
++  ins->setInt64Operand(0, useInt64RegisterAtStart(input));
++  defineInt64(ins, mir);
++}
++
++void LIRGeneratorPPC64::lowerForALUInt64(
++    LInstructionHelper<INT64_PIECES, 2 * INT64_PIECES, 0>* ins,
++    MDefinition* mir, MDefinition* lhs, MDefinition* rhs) {
++  ins->setInt64Operand(0, useInt64RegisterAtStart(lhs));
++  ins->setInt64Operand(INT64_PIECES, useInt64RegisterOrConstantAtStart(rhs));
++  defineInt64(ins, mir);
++}
++
++void LIRGeneratorPPC64::lowerForMulInt64(LMulI64* ins, MMul* mir,
++                                         MDefinition* lhs, MDefinition* rhs) {
++  lowerForALUInt64(ins, mir, lhs, rhs);
++}
++
++void LIRGeneratorPPC64::lowerForFPU(LInstructionHelper<1, 1, 0>* ins,
++                                    MDefinition* mir, MDefinition* input) {
++  ins->setOperand(0, useRegisterAtStart(input));
++  define(ins, mir);
++}
++
++void LIRGeneratorPPC64::lowerForFPU(LInstructionHelper<1, 2, 0>* ins,
++                                    MDefinition* mir, MDefinition* lhs,
++                                    MDefinition* rhs) {
++  ins->setOperand(0, useRegisterAtStart(lhs));
++  ins->setOperand(1, useRegisterAtStart(rhs));
++  define(ins, mir);
++}
++
++LBoxAllocation LIRGeneratorPPC64::useBoxFixed(MDefinition* mir, Register reg1,
++                                              Register reg2, bool useAtStart) {
++  MOZ_ASSERT(mir->type() == MIRType::Value);
++
++  ensureDefined(mir);
++  return LBoxAllocation(LUse(reg1, mir->virtualRegister(), useAtStart));
++}
++
++LAllocation LIRGeneratorPPC64::useByteOpRegister(MDefinition* mir) {
++  return useRegister(mir);
++}
++
++LAllocation LIRGeneratorPPC64::useByteOpRegisterAtStart(MDefinition* mir) {
++  return useRegisterAtStart(mir);
++}
++
++LAllocation LIRGeneratorPPC64::useByteOpRegisterOrNonDoubleConstant(
++    MDefinition* mir) {
++  return useRegisterOrNonDoubleConstant(mir);
++}
++
++LDefinition LIRGeneratorPPC64::tempByteOpRegister() { return temp(); }
++
++LDefinition LIRGeneratorPPC64::tempToUnbox() { return temp(); }
++
++void LIRGeneratorPPC64::lowerUntypedPhiInput(MPhi* phi, uint32_t inputPosition,
++                                             LBlock* block, size_t lirIndex) {
++  lowerTypedPhiInput(phi, inputPosition, block, lirIndex);
++}
++
++void LIRGeneratorPPC64::lowerInt64PhiInput(MPhi* phi, uint32_t inputPosition,
++                                           LBlock* block, size_t lirIndex) {
++  lowerTypedPhiInput(phi, inputPosition, block, lirIndex);
++}
++
++void LIRGeneratorPPC64::defineInt64Phi(MPhi* phi, size_t lirIndex) {
++  defineTypedPhi(phi, lirIndex);
++}
++
++void LIRGeneratorPPC64::lowerMulI(MMul* mul, MDefinition* lhs,
++                                  MDefinition* rhs) {
++  LMulI* lir = new (alloc()) LMulI;
++  if (mul->fallible()) {
++    assignSnapshot(lir, mul->bailoutKind());
++  }
++  if (mul->canBeNegativeZero() && !rhs->isConstant()) {
++    lir->setOperand(0, useRegister(lhs));
++    lir->setOperand(1, useRegister(rhs));
++    define(lir, mul);
++    return;
++  }
++  lowerForALU(lir, mul, lhs, rhs);
++}
++
++void LIRGeneratorPPC64::lowerDivI(MDiv* div) {
++  if (div->rhs()->isConstant()) {
++    int32_t rhs = div->rhs()->toConstant()->toInt32();
++    int32_t shift = FloorLog2(uint32_t(rhs));
++    if (rhs > 0 && 1 << shift == rhs) {
++      LDivPowTwoI* lir =
++          new (alloc()) LDivPowTwoI(useRegister(div->lhs()), shift);
++      if (div->fallible()) {
++        assignSnapshot(lir, div->bailoutKind());
++      }
++      define(lir, div);
++      return;
++    }
++  }
++  LDivI* lir = new (alloc())
++      LDivI(useRegister(div->lhs()), useRegister(div->rhs()), temp());
++  if (div->fallible()) {
++    assignSnapshot(lir, div->bailoutKind());
++  }
++  define(lir, div);
++}
++
++void LIRGeneratorPPC64::lowerDivI64(MDiv* div) {
++  auto* lir = new (alloc())
++      LDivOrModI64(useRegister(div->lhs()), useRegister(div->rhs()));
++  defineInt64(lir, div);
++}
++
++void LIRGeneratorPPC64::lowerModI(MMod* mod) {
++  if (mod->rhs()->isConstant()) {
++    int32_t rhs = mod->rhs()->toConstant()->toInt32();
++    int32_t shift = FloorLog2(uint32_t(rhs));
++    if (rhs > 0 && 1 << shift == rhs) {
++      LModPowTwoI* lir =
++          new (alloc()) LModPowTwoI(useRegister(mod->lhs()), shift);
++      if (mod->fallible()) {
++        assignSnapshot(lir, mod->bailoutKind());
++      }
++      define(lir, mod);
++      return;
++    } else if (shift < 31 && (1 << (shift + 1)) - 1 == rhs) {
++      LModMaskI* lir = new (alloc())
++          LModMaskI(useRegister(mod->lhs()), temp(), temp(), shift + 1);
++      if (mod->fallible()) {
++        assignSnapshot(lir, mod->bailoutKind());
++      }
++      define(lir, mod);
++      return;
++    }
++  }
++  auto* lir =
++      new (alloc()) LModI(useRegister(mod->lhs()), useRegister(mod->rhs()));
++  if (mod->fallible()) {
++    assignSnapshot(lir, mod->bailoutKind());
++  }
++  define(lir, mod);
++}
++
++void LIRGeneratorPPC64::lowerModI64(MMod* mod) {
++  auto* lir = new (alloc())
++      LDivOrModI64(useRegister(mod->lhs()), useRegister(mod->rhs()));
++  defineInt64(lir, mod);
++}
++
++void LIRGeneratorPPC64::lowerUDiv(MDiv* div) {
++  MDefinition* lhs = div->getOperand(0);
++  MDefinition* rhs = div->getOperand(1);
++  LUDivOrMod* lir = new (alloc()) LUDivOrMod;
++  // useRegisterAtStart: CodeGenerator-ppc64's visitUDivOrMod zero-extends
++  // lhs/rhs into their own slots in place before the 32-bit divwu, so the
++  // inputs must not be required live after the LIR op begins.
++  lir->setOperand(0, useRegisterAtStart(lhs));
++  lir->setOperand(1, useRegisterAtStart(rhs));
++  if (div->fallible()) {
++    assignSnapshot(lir, div->bailoutKind());
++  }
++  define(lir, div);
++}
++
++void LIRGeneratorPPC64::lowerUDivI64(MDiv* div) {
++  auto* lir = new (alloc())
++      LUDivOrModI64(useRegister(div->lhs()), useRegister(div->rhs()));
++  defineInt64(lir, div);
++}
++
++void LIRGeneratorPPC64::lowerUMod(MMod* mod) {
++  MDefinition* lhs = mod->getOperand(0);
++  MDefinition* rhs = mod->getOperand(1);
++  LUDivOrMod* lir = new (alloc()) LUDivOrMod;
++  // See lowerUDiv above for why useRegisterAtStart is required here.
++  lir->setOperand(0, useRegisterAtStart(lhs));
++  lir->setOperand(1, useRegisterAtStart(rhs));
++  if (mod->fallible()) {
++    assignSnapshot(lir, mod->bailoutKind());
++  }
++  define(lir, mod);
++}
++
++void LIRGeneratorPPC64::lowerUModI64(MMod* mod) {
++  auto* lir = new (alloc())
++      LUDivOrModI64(useRegister(mod->lhs()), useRegister(mod->rhs()));
++  defineInt64(lir, mod);
++}
++
++void LIRGeneratorPPC64::lowerUrshD(MUrsh* mir) {
++  MDefinition* lhs = mir->lhs();
++  MDefinition* rhs = mir->rhs();
++  MOZ_ASSERT(lhs->type() == MIRType::Int32);
++  MOZ_ASSERT(rhs->type() == MIRType::Int32);
++  auto* lir = new (alloc()) LUrshD(useRegisterAtStart(lhs),
++                                   useRegisterOrConstantAtStart(rhs), temp());
++  define(lir, mir);
++}
++
++void LIRGeneratorPPC64::lowerPowOfTwoI(MPow* mir) {
++  int32_t base = mir->input()->toConstant()->toInt32();
++  MDefinition* power = mir->power();
++  auto* lir = new (alloc()) LPowOfTwoI(useRegister(power), base);
++  assignSnapshot(lir, mir->bailoutKind());
++  define(lir, mir);
++}
++
++void LIRGeneratorPPC64::lowerBigIntPtrDiv(MBigIntPtrDiv* ins) {
++  auto* lir = new (alloc())
++      LBigIntPtrDiv(useRegister(ins->lhs()), useRegister(ins->rhs()),
++                    LDefinition::BogusTemp(), LDefinition::BogusTemp());
++  assignSnapshot(lir, ins->bailoutKind());
++  define(lir, ins);
++}
++
++void LIRGeneratorPPC64::lowerBigIntPtrMod(MBigIntPtrMod* ins) {
++  auto* lir = new (alloc())
++      LBigIntPtrMod(useRegister(ins->lhs()), useRegister(ins->rhs()), temp(),
++                    LDefinition::BogusTemp());
++  if (ins->canBeDivideByZero()) {
++    assignSnapshot(lir, ins->bailoutKind());
++  }
++  define(lir, ins);
++}
++
++void LIRGeneratorPPC64::lowerBigIntPtrLsh(MBigIntPtrLsh* ins) {
++  auto* lir = new (alloc()) LBigIntPtrLsh(
++      useRegister(ins->lhs()), useRegister(ins->rhs()), temp(), temp());
++  assignSnapshot(lir, ins->bailoutKind());
++  define(lir, ins);
++}
++
++void LIRGeneratorPPC64::lowerBigIntPtrRsh(MBigIntPtrRsh* ins) {
++  auto* lir = new (alloc()) LBigIntPtrRsh(
++      useRegister(ins->lhs()), useRegister(ins->rhs()), temp(), temp());
++  assignSnapshot(lir, ins->bailoutKind());
++  define(lir, ins);
++}
++
++void LIRGeneratorPPC64::lowerTruncateDToInt32(MTruncateToInt32* ins) {
++  MDefinition* opd = ins->input();
++  MOZ_ASSERT(opd->type() == MIRType::Double);
++  define(new (alloc()) LTruncateDToInt32(useRegister(opd), tempDouble()), ins);
++}
++
++void LIRGeneratorPPC64::lowerTruncateFToInt32(MTruncateToInt32* ins) {
++  MDefinition* opd = ins->input();
++  MOZ_ASSERT(opd->type() == MIRType::Float32);
++  define(new (alloc()) LTruncateFToInt32(useRegister(opd), tempFloat32()), ins);
++}
++
++void LIRGeneratorPPC64::lowerBuiltinInt64ToFloatingPoint(
++    MBuiltinInt64ToFloatingPoint* ins) {
++  MOZ_CRASH("We don't use it for this architecture");
++}
++
++void LIRGeneratorPPC64::lowerWasmSelectI(MWasmSelect* select) {
++  auto* lir = new (alloc())
++      LWasmSelect(useRegisterAtStart(select->trueExpr()),
++                  useAny(select->falseExpr()), useRegister(select->condExpr()));
++  defineReuseInput(lir, select, LWasmSelect::TrueExprIndex);
++}
++
++void LIRGeneratorPPC64::lowerWasmSelectI64(MWasmSelect* select) {
++  auto* lir = new (alloc()) LWasmSelectI64(
++      useInt64RegisterAtStart(select->trueExpr()),
++      useInt64(select->falseExpr()), useRegister(select->condExpr()));
++  defineInt64ReuseInput(lir, select, LWasmSelectI64::TrueExprIndex);
++}
++
++void LIRGeneratorPPC64::lowerWasmBuiltinTruncateToInt32(
++    MWasmBuiltinTruncateToInt32* ins) {
++  MDefinition* opd = ins->input();
++  MOZ_ASSERT(opd->type() == MIRType::Double || opd->type() == MIRType::Float32);
++
++  if (opd->type() == MIRType::Double) {
++    define(new (alloc()) LWasmBuiltinTruncateDToInt32(
++               useRegister(opd), useFixed(ins->instance(), InstanceReg),
++               LDefinition::BogusTemp()),
++           ins);
++    return;
++  }
++
++  define(new (alloc()) LWasmBuiltinTruncateFToInt32(
++             useRegister(opd), useFixed(ins->instance(), InstanceReg),
++             LDefinition::BogusTemp()),
++         ins);
++}
++
++void LIRGeneratorPPC64::lowerWasmBuiltinTruncateToInt64(
++    MWasmBuiltinTruncateToInt64* ins) {
++  MOZ_CRASH("We don't use it for this architecture");
++}
++
++void LIRGeneratorPPC64::lowerWasmBuiltinDivI64(MWasmBuiltinDivI64* div) {
++  MOZ_CRASH("We don't use runtime div for this architecture");
++}
++
++void LIRGeneratorPPC64::lowerWasmBuiltinModI64(MWasmBuiltinModI64* mod) {
++  MOZ_CRASH("We don't use runtime mod for this architecture");
++}
++
++void LIRGeneratorPPC64::lowerAtomicLoad64(MLoadUnboxedScalar* ins) {
++  const LUse elements = useRegister(ins->elements());
++  const LAllocation index =
++      useRegisterOrIndexConstant(ins->index(), ins->storageType());
++  auto* lir = new (alloc()) LAtomicLoad64(elements, index);
++  defineInt64(lir, ins);
++}
++
++void LIRGeneratorPPC64::lowerAtomicStore64(MStoreUnboxedScalar* ins) {
++  LUse elements = useRegister(ins->elements());
++  LAllocation index =
++      useRegisterOrIndexConstant(ins->index(), ins->writeType());
++  LInt64Allocation value = useInt64Register(ins->value());
++  add(new (alloc()) LAtomicStore64(elements, index, value), ins);
++}
++
++// ===============================================================
++// LIRGenerator::visit* implementations
++
++void LIRGenerator::visitBox(MBox* box) {
++  MDefinition* opd = box->getOperand(0);
++
++  if (opd->isConstant() && box->canEmitAtUses()) {
++    emitAtUses(box);
++    return;
++  }
++
++  if (opd->isConstant()) {
++    define(new (alloc()) LValue(opd->toConstant()->toJSValue()), box,
++           LDefinition(LDefinition::BOX));
++  } else {
++    LBox* ins = new (alloc()) LBox(useRegisterAtStart(opd), opd->type());
++    define(ins, box, LDefinition(LDefinition::BOX));
++  }
++}
++
++void LIRGenerator::visitUnbox(MUnbox* unbox) {
++  MDefinition* box = unbox->getOperand(0);
++  MOZ_ASSERT(box->type() == MIRType::Value);
++
++  LInstructionHelper<1, BOX_PIECES, 0>* lir;
++  if (IsFloatingPointType(unbox->type())) {
++    MOZ_ASSERT(unbox->type() == MIRType::Double);
++    lir = new (alloc()) LUnboxFloatingPoint(useBoxAtStart(box));
++  } else if (unbox->fallible()) {
++    lir = new (alloc()) LUnbox(useRegisterAtStart(box));
++  } else {
++    lir = new (alloc()) LUnbox(useAtStart(box));
++  }
++
++  if (unbox->fallible()) {
++    assignSnapshot(lir, unbox->bailoutKind());
++  }
++
++  define(lir, unbox);
++}
++
++void LIRGenerator::visitCopySign(MCopySign* ins) {
++  MDefinition* lhs = ins->lhs();
++  MDefinition* rhs = ins->rhs();
++
++  MOZ_ASSERT(IsFloatingPointType(lhs->type()));
++  MOZ_ASSERT(lhs->type() == rhs->type());
++  MOZ_ASSERT(lhs->type() == ins->type());
++
++  LInstructionHelper<1, 2, 0>* lir;
++  if (lhs->type() == MIRType::Double) {
++    lir = new (alloc()) LCopySignD();
++  } else {
++    lir = new (alloc()) LCopySignF();
++  }
++
++  lowerForFPU(lir, ins, lhs, rhs);
++}
++
++void LIRGenerator::visitExtendInt32ToInt64(MExtendInt32ToInt64* ins) {
++  defineInt64(
++      new (alloc()) LExtendInt32ToInt64(useRegisterAtStart(ins->input())), ins);
++}
++
++void LIRGenerator::visitSignExtendInt64(MSignExtendInt64* ins) {
++  defineInt64(new (alloc())
++                  LSignExtendInt64(useInt64RegisterAtStart(ins->input())),
++              ins);
++}
++
++void LIRGenerator::visitInt64ToFloatingPoint(MInt64ToFloatingPoint* ins) {
++  MDefinition* opd = ins->input();
++  MOZ_ASSERT(opd->type() == MIRType::Int64);
++  MOZ_ASSERT(IsFloatingPointType(ins->type()));
++  define(new (alloc()) LInt64ToFloatingPoint(useInt64Register(opd)), ins);
++}
++
++void LIRGenerator::visitSubstr(MSubstr* ins) {
++  LSubstr* lir = new (alloc())
++      LSubstr(useRegister(ins->string()), useRegister(ins->begin()),
++              useRegister(ins->length()), temp(), temp(), temp());
++  define(lir, ins);
++  assignSafepoint(lir, ins);
++}
++
++void LIRGenerator::visitReturnImpl(MDefinition* opd, bool isGenerator) {
++  MOZ_ASSERT(opd->type() == MIRType::Value);
++  LReturn* ins = new (alloc()) LReturn(isGenerator);
++  ins->setOperand(0, useFixed(opd, JSReturnReg));
++  add(ins);
++}
++void LIRGenerator::visitCompareExchangeTypedArrayElement(
++    MCompareExchangeTypedArrayElement* ins) {
++  MOZ_ASSERT(!Scalar::isFloatingType(ins->arrayType()));
++  MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
++  MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr);
++
++  const LUse elements = useRegister(ins->elements());
++  const LAllocation index =
++      useRegisterOrIndexConstant(ins->index(), ins->arrayType());
++
++  if (Scalar::isBigIntType(ins->arrayType())) {
++    LInt64Allocation oldval = useInt64Register(ins->oldval());
++    LInt64Allocation newval = useInt64Register(ins->newval());
++
++    auto* lir = new (alloc())
++        LCompareExchangeTypedArrayElement64(elements, index, oldval, newval);
++    defineInt64(lir, ins);
++    return;
++  }
++
++  const LAllocation oldval = useRegister(ins->oldval());
++  const LAllocation newval = useRegister(ins->newval());
++
++  LDefinition outTemp = LDefinition::BogusTemp();
++  LDefinition valueTemp = LDefinition::BogusTemp();
++  LDefinition offsetTemp = LDefinition::BogusTemp();
++  LDefinition maskTemp = LDefinition::BogusTemp();
++
++  if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) {
++    outTemp = temp();
++  }
++
++  if (Scalar::byteSize(ins->arrayType()) < 4) {
++    // PPC64 sub-word CAS uses lbarx/lharx + stbcx./sthcx. (POWER7+); only
++    // valueTemp is needed, to hold the extsb/extsh-canonicalised oldval
++    // for the 32-bit cmpw. offsetTemp/maskTemp are unused (no round-down
++    // + bit-isolate dance), and remain BogusTemp.
++    valueTemp = temp();
++  }
++
++  LCompareExchangeTypedArrayElement* lir = new (alloc())
++      LCompareExchangeTypedArrayElement(elements, index, oldval, newval,
++                                        outTemp, valueTemp, offsetTemp,
++                                        maskTemp);
++
++  define(lir, ins);
++}
++
++void LIRGenerator::visitAtomicExchangeTypedArrayElement(
++    MAtomicExchangeTypedArrayElement* ins) {
++  MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
++  MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr);
++
++  const LUse elements = useRegister(ins->elements());
++  const LAllocation index =
++      useRegisterOrIndexConstant(ins->index(), ins->arrayType());
++
++  if (Scalar::isBigIntType(ins->arrayType())) {
++    LInt64Allocation value = useInt64Register(ins->value());
++
++    auto* lir = new (alloc())
++        LAtomicExchangeTypedArrayElement64(elements, index, value);
++    defineInt64(lir, ins);
++    return;
++  }
++
++  MOZ_ASSERT(ins->arrayType() <= Scalar::Uint32);
++
++  const LAllocation value = useRegister(ins->value());
++
++  LDefinition outTemp = LDefinition::BogusTemp();
++  LDefinition valueTemp = LDefinition::BogusTemp();
++  LDefinition offsetTemp = LDefinition::BogusTemp();
++  LDefinition maskTemp = LDefinition::BogusTemp();
++
++  if (ins->arrayType() == Scalar::Uint32) {
++    MOZ_ASSERT(ins->type() == MIRType::Double);
++    outTemp = temp();
++  }
++
++  // PPC64 sub-word atomic exchange uses lbarx/lharx + stbcx./sthcx. directly
++  // (POWER7+); valueTemp/offsetTemp/maskTemp are never read by the
++  // implementation (see MacroAssembler-ppc64.cpp's AtomicExchange template).
++  // Leave them as BogusTemp.
++
++  LAtomicExchangeTypedArrayElement* lir =
++      new (alloc()) LAtomicExchangeTypedArrayElement(
++          elements, index, value, outTemp, valueTemp, offsetTemp, maskTemp);
++
++  define(lir, ins);
++}
++
++void LIRGenerator::visitAtomicTypedArrayElementBinop(
++    MAtomicTypedArrayElementBinop* ins) {
++  MOZ_ASSERT(ins->arrayType() != Scalar::Uint8Clamped);
++  MOZ_ASSERT(!Scalar::isFloatingType(ins->arrayType()));
++  MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
++  MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr);
++
++  const LUse elements = useRegister(ins->elements());
++  const LAllocation index =
++      useRegisterOrIndexConstant(ins->index(), ins->arrayType());
++
++  if (Scalar::isBigIntType(ins->arrayType())) {
++    LInt64Allocation value = useInt64Register(ins->value());
++    LInt64Definition temp = tempInt64();
++
++    if (ins->isForEffect()) {
++      auto* lir = new (alloc()) LAtomicTypedArrayElementBinopForEffect64(
++          elements, index, value, temp);
++      add(lir, ins);
++      return;
++    }
++
++    auto* lir = new (alloc())
++        LAtomicTypedArrayElementBinop64(elements, index, value, temp);
++    defineInt64(lir, ins);
++    return;
++  }
++
++  LAllocation value = useRegister(ins->value());
++  LDefinition valueTemp = LDefinition::BogusTemp();
++  LDefinition offsetTemp = LDefinition::BogusTemp();
++  LDefinition maskTemp = LDefinition::BogusTemp();
++
++  // PPC64 sub-word atomic-binop uses lbarx/lharx + stbcx./sthcx. (POWER7+).
++  // The fetch-op variant needs valueTemp to hold the post-op value being
++  // condition-stored (MacroAssembler-ppc64.cpp's AtomicFetchOp); the
++  // for-effect variant uses an internal scratch and needs no temps at
++  // all. offsetTemp/maskTemp are unused in either path.
++  if (Scalar::byteSize(ins->arrayType()) < 4 && !ins->isForEffect()) {
++    valueTemp = temp();
++  }
++
++  if (ins->isForEffect()) {
++    LAtomicTypedArrayElementBinopForEffect* lir =
++        new (alloc()) LAtomicTypedArrayElementBinopForEffect(
++            elements, index, value, valueTemp, offsetTemp, maskTemp);
++    add(lir, ins);
++    return;
++  }
++
++  LDefinition outTemp = LDefinition::BogusTemp();
++
++  if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) {
++    outTemp = temp();
++  }
++
++  LAtomicTypedArrayElementBinop* lir =
++      new (alloc()) LAtomicTypedArrayElementBinop(
++          elements, index, value, outTemp, valueTemp, offsetTemp, maskTemp);
++  define(lir, ins);
++}
++void LIRGenerator::visitAsmJSLoadHeap(MAsmJSLoadHeap* ins) {
++  MDefinition* base = ins->base();
++  MOZ_ASSERT(base->type() == MIRType::Int32);
++
++  MDefinition* boundsCheckLimit = ins->boundsCheckLimit();
++  MOZ_ASSERT_IF(ins->needsBoundsCheck(),
++                boundsCheckLimit->type() == MIRType::Int32);
++
++  LAllocation baseAlloc = useRegisterAtStart(base);
++
++  LAllocation limitAlloc = ins->needsBoundsCheck()
++                               ? useRegisterAtStart(boundsCheckLimit)
++                               : LAllocation();
++
++  MOZ_ASSERT(!ins->hasMemoryBase());
++  auto* lir =
++      new (alloc()) LAsmJSLoadHeap(baseAlloc, limitAlloc, LAllocation());
++  define(lir, ins);
++}
++void LIRGenerator::visitAsmJSStoreHeap(MAsmJSStoreHeap* ins) {
++  MDefinition* base = ins->base();
++  MOZ_ASSERT(base->type() == MIRType::Int32);
++
++  MDefinition* boundsCheckLimit = ins->boundsCheckLimit();
++  MOZ_ASSERT_IF(ins->needsBoundsCheck(),
++                boundsCheckLimit->type() == MIRType::Int32);
++
++  LAllocation baseAlloc = useRegisterAtStart(base);
++
++  LAllocation limitAlloc = ins->needsBoundsCheck()
++                               ? useRegisterAtStart(boundsCheckLimit)
++                               : LAllocation();
++
++  MOZ_ASSERT(!ins->hasMemoryBase());
++  add(new (alloc()) LAsmJSStoreHeap(baseAlloc, useRegisterAtStart(ins->value()),
++                                    limitAlloc, LAllocation()),
++      ins);
++}
++void LIRGenerator::visitWasmLoad(MWasmLoad* ins) {
++  MDefinition* base = ins->base();
++  MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
++
++  LAllocation memoryBase =
++      ins->hasMemoryBase() ? LAllocation(useRegisterAtStart(ins->memoryBase()))
++                           : LGeneralReg(HeapReg);
++
++  LAllocation ptr = useRegisterAtStart(base);
++
++  LDefinition ptrCopy = LDefinition::BogusTemp();
++  if (ins->access().offset32()) {
++    ptrCopy = tempCopy(base, 0);
++  }
++
++  if (ins->type() == MIRType::Int64) {
++    auto* lir = new (alloc()) LWasmLoadI64(ptr, memoryBase, ptrCopy);
++    defineInt64(lir, ins);
++    return;
++  }
++
++  auto* lir = new (alloc()) LWasmLoad(ptr, memoryBase, ptrCopy);
++  define(lir, ins);
++}
++void LIRGenerator::visitWasmStore(MWasmStore* ins) {
++  MDefinition* base = ins->base();
++  MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
++
++  MDefinition* value = ins->value();
++  LAllocation memoryBase =
++      ins->hasMemoryBase() ? LAllocation(useRegisterAtStart(ins->memoryBase()))
++                           : LGeneralReg(HeapReg);
++
++  LAllocation baseAlloc = useRegisterAtStart(base);
++
++  LDefinition ptrCopy = LDefinition::BogusTemp();
++  if (ins->access().offset32()) {
++    ptrCopy = tempCopy(base, 0);
++  }
++
++  if (ins->access().type() == Scalar::Int64) {
++    LInt64Allocation valueAlloc = useInt64RegisterAtStart(value);
++    auto* lir =
++        new (alloc()) LWasmStoreI64(baseAlloc, valueAlloc, memoryBase, ptrCopy);
++    add(lir, ins);
++    return;
++  }
++
++  LAllocation valueAlloc = useRegisterAtStart(value);
++  auto* lir =
++      new (alloc()) LWasmStore(baseAlloc, valueAlloc, memoryBase, ptrCopy);
++  add(lir, ins);
++}
++void LIRGenerator::visitWasmTruncateToInt64(MWasmTruncateToInt64* ins) {
++  MDefinition* opd = ins->input();
++  MOZ_ASSERT(opd->type() == MIRType::Double || opd->type() == MIRType::Float32);
++
++  defineInt64(new (alloc()) LWasmTruncateToInt64(useRegister(opd)), ins);
++}
++void LIRGenerator::visitWasmUnsignedToDouble(MWasmUnsignedToDouble* ins) {
++  MOZ_ASSERT(ins->input()->type() == MIRType::Int32);
++  LWasmUint32ToDouble* lir =
++      new (alloc()) LWasmUint32ToDouble(useRegisterAtStart(ins->input()));
++  define(lir, ins);
++}
++void LIRGenerator::visitWasmUnsignedToFloat32(MWasmUnsignedToFloat32* ins) {
++  MOZ_ASSERT(ins->input()->type() == MIRType::Int32);
++  LWasmUint32ToFloat32* lir =
++      new (alloc()) LWasmUint32ToFloat32(useRegisterAtStart(ins->input()));
++  define(lir, ins);
++}
++void LIRGenerator::visitWasmCompareExchangeHeap(MWasmCompareExchangeHeap* ins) {
++  MDefinition* base = ins->base();
++  MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
++  LAllocation memoryBase = ins->hasMemoryBase()
++                               ? LAllocation(useRegister(ins->memoryBase()))
++                               : LGeneralReg(HeapReg);
++
++  if (ins->access().type() == Scalar::Int64) {
++    auto* lir = new (alloc()) LWasmCompareExchangeI64(
++        useRegister(base), useInt64Register(ins->oldValue()),
++        useInt64Register(ins->newValue()), memoryBase);
++    defineInt64(lir, ins);
++    return;
++  }
++
++  LDefinition valueTemp = LDefinition::BogusTemp();
++  LDefinition offsetTemp = LDefinition::BogusTemp();
++  LDefinition maskTemp = LDefinition::BogusTemp();
++
++  // PPC64 sub-word wasm CAS uses lbarx/lharx + stbcx./sthcx. (POWER7+);
++  // valueTemp holds the extsb/extsh-canonicalised oldval for cmpw, while
++  // offsetTemp/maskTemp are unused (no round-down + bit-isolate dance).
++  if (ins->access().byteSize() < 4) {
++    valueTemp = temp();
++  }
++
++  auto* lir = new (alloc())
++      LWasmCompareExchangeHeap(useRegister(base), useRegister(ins->oldValue()),
++                               useRegister(ins->newValue()), memoryBase,
++                               valueTemp, offsetTemp, maskTemp);
++
++  define(lir, ins);
++}
++void LIRGenerator::visitWasmAtomicExchangeHeap(MWasmAtomicExchangeHeap* ins) {
++  MDefinition* base = ins->base();
++  MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
++  LAllocation memoryBase = ins->hasMemoryBase()
++                               ? LAllocation(useRegister(ins->memoryBase()))
++                               : LGeneralReg(HeapReg);
++
++  if (ins->access().type() == Scalar::Int64) {
++    auto* lir = new (alloc()) LWasmAtomicExchangeI64(
++        useRegister(base), useInt64Register(ins->value()), memoryBase);
++    defineInt64(lir, ins);
++    return;
++  }
++
++  // PPC64 sub-word wasm atomic exchange uses lbarx/lharx + stbcx./sthcx.
++  // (POWER7+); valueTemp/offsetTemp/maskTemp are never read by the
++  // implementation (see MacroAssembler-ppc64.cpp's AtomicExchange template).
++  // Pass BogusTemp for all three.
++  LDefinition valueTemp = LDefinition::BogusTemp();
++  LDefinition offsetTemp = LDefinition::BogusTemp();
++  LDefinition maskTemp = LDefinition::BogusTemp();
++
++  auto* lir = new (alloc())
++      LWasmAtomicExchangeHeap(useRegister(base), useRegister(ins->value()),
++                              memoryBase, valueTemp, offsetTemp, maskTemp);
++  define(lir, ins);
++}
++void LIRGenerator::visitWasmAtomicBinopHeap(MWasmAtomicBinopHeap* ins) {
++  MDefinition* base = ins->base();
++  MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
++  LAllocation memoryBase = ins->hasMemoryBase()
++                               ? LAllocation(useRegister(ins->memoryBase()))
++                               : LGeneralReg(HeapReg);
++
++  if (ins->access().type() == Scalar::Int64) {
++    auto* lir = new (alloc())
++        LWasmAtomicBinopI64(useRegister(base), useInt64Register(ins->value()),
++                            memoryBase, tempInt64());
++    defineInt64(lir, ins);
++    return;
++  }
++
++  LDefinition valueTemp = LDefinition::BogusTemp();
++  LDefinition offsetTemp = LDefinition::BogusTemp();
++  LDefinition maskTemp = LDefinition::BogusTemp();
++
++  // PPC64 sub-word wasm atomic-binop uses lbarx/lharx + stbcx./sthcx.
++  // (POWER7+). The fetch-op variant needs valueTemp for the post-op value
++  // being condition-stored; the for-effect variant uses an internal
++  // scratch and needs no temps at all. offsetTemp/maskTemp are unused
++  // in either path.
++  if (ins->access().byteSize() < 4 && ins->hasUses()) {
++    valueTemp = temp();
++  }
++
++  if (!ins->hasUses()) {
++    LWasmAtomicBinopHeapForEffect* lir = new (alloc())
++        LWasmAtomicBinopHeapForEffect(useRegister(base),
++                                      useRegister(ins->value()), memoryBase,
++                                      valueTemp, offsetTemp, maskTemp);
++    add(lir, ins);
++    return;
++  }
++
++  auto* lir = new (alloc())
++      LWasmAtomicBinopHeap(useRegister(base), useRegister(ins->value()),
++                           memoryBase, valueTemp, offsetTemp, maskTemp);
++
++  define(lir, ins);
++}
++
++// SIMD lowering
++void LIRGenerator::visitWasmTernarySimd128(MWasmTernarySimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++  MOZ_ASSERT(ins->type() == MIRType::Simd128);
++  // useRegister for v0/v1 and useRegisterAtStart only for v2 — matches
++  // ARM64's V128Bitselect policy. defineReuseInput requires the reused
++  // input to be useRegisterAtStart and the others to remain alive
++  // (useRegister); reusing all three policies as useRegisterAtStart
++  // trips the allocator's "*def->output() != alloc" assertion because
++  // v0/v1 may then share the slot with the output.
++  LDefinition temp0 = LDefinition::BogusTemp();
++  if (ins->simdOp() == wasm::SimdOp::I32x4RelaxedDotI8x16I7x16AddS) {
++    temp0 = tempSimd128();
++  }
++  auto* lir = new (alloc()) LWasmTernarySimd128(
++      useRegister(ins->v0()), useRegister(ins->v1()),
++      useRegisterAtStart(ins->v2()), temp0,
++      ins->simdOp());
++  // The PPC64 visitor (CodeGenerator-ppc64.cpp:visitWasmTernarySimd128)
++  // emits the FMA / DOT_THEN_ADD chain with v2 as the implicit
++  // accumulator. defineReuseInput tells the allocator to put `dest`
++  // in v2's slot, eliminating the previous conditional moveSimd128.
++  defineReuseInput(lir, ins, LWasmTernarySimd128::V2Index);
++#else
++  MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++  MOZ_ASSERT(ins->type() == MIRType::Simd128);
++  LDefinition temp0 = LDefinition::BogusTemp();
++  LDefinition temp1 = LDefinition::BogusTemp();
++  // mulInt64x2 (i64x2.mul) routes through GPRs (mfvsrd/mulld/mtvsrd) and
++  // uses an internal ScratchSimd128 + GPR scratches; its FloatRegister
++  // temp1/temp2 parameters are inherited from the shared ARM64+PPC64
++  // signature but unused on PPC64. Only FP min/max need SIMD temps for
++  // the wasm NaN-canonicalisation dance.
++  if (ins->simdOp() == wasm::SimdOp::F32x4Min ||
++      ins->simdOp() == wasm::SimdOp::F32x4Max ||
++      ins->simdOp() == wasm::SimdOp::F64x2Min ||
++      ins->simdOp() == wasm::SimdOp::F64x2Max) {
++    temp0 = tempSimd128();
++    temp1 = tempSimd128();
++  }
++  auto* lir = new (alloc()) LWasmBinarySimd128(
++      useRegisterAtStart(ins->lhs()), useRegisterAtStart(ins->rhs()),
++      temp0, temp1, ins->simdOp());
++  define(lir, ins);
++#else
++  MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmBinarySimd128WithConstant(
++    MWasmBinarySimd128WithConstant* ins) {
++#ifdef ENABLE_WASM_SIMD
++  MOZ_ASSERT(ins->type() == MIRType::Simd128);
++  auto* lir = new (alloc()) LWasmBinarySimd128WithConstant(
++      useRegisterAtStart(ins->lhs()), LDefinition::BogusTemp(), ins->rhs());
++  define(lir, ins);
++#else
++  MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++  MOZ_ASSERT(ins->type() == MIRType::Simd128);
++  MOZ_ASSERT(ins->rhs()->type() == MIRType::Int32);
++
++  if (ins->rhs()->isConstant()) {
++    int32_t shiftCountMask;
++    switch (ins->simdOp()) {
++      case wasm::SimdOp::I8x16Shl:
++      case wasm::SimdOp::I8x16ShrU:
++      case wasm::SimdOp::I8x16ShrS:
++        shiftCountMask = 7;
++        break;
++      case wasm::SimdOp::I16x8Shl:
++      case wasm::SimdOp::I16x8ShrU:
++      case wasm::SimdOp::I16x8ShrS:
++        shiftCountMask = 15;
++        break;
++      case wasm::SimdOp::I32x4Shl:
++      case wasm::SimdOp::I32x4ShrU:
++      case wasm::SimdOp::I32x4ShrS:
++        shiftCountMask = 31;
++        break;
++      case wasm::SimdOp::I64x2Shl:
++      case wasm::SimdOp::I64x2ShrU:
++      case wasm::SimdOp::I64x2ShrS:
++        shiftCountMask = 63;
++        break;
++      default:
++        MOZ_CRASH("Unexpected shift operation");
++    }
++    int32_t shiftCount = ins->rhs()->toConstant()->toInt32() & shiftCountMask;
++#ifdef DEBUG
++    js::wasm::ReportSimdAnalysis("shift -> constant shift");
++#endif
++    auto* lir = new (alloc())
++        LWasmConstantShiftSimd128(useRegisterAtStart(ins->lhs()), shiftCount);
++    define(lir, ins);
++  } else {
++#ifdef DEBUG
++    js::wasm::ReportSimdAnalysis("shift -> variable shift");
++#endif
++    auto* lir = new (alloc()) LWasmVariableShiftSimd128(
++        useRegisterAtStart(ins->lhs()), useRegisterAtStart(ins->rhs()));
++    define(lir, ins);
++  }
++#else
++  MOZ_CRASH("No SIMD");
++#endif
++}
++#ifdef ENABLE_WASM_SIMD
++// Helper: reconstruct raw Wasm byte lane indices from analyzed SimdShuffle.
++static SimdConstant ReconstructShuffleBytes(const SimdShuffle& s) {
++  int8_t bytes[16];
++  if (s.permuteOp) {
++    switch (*s.permuteOp) {
++      case SimdPermuteOp::MOVE:
++        for (int i = 0; i < 16; i++) bytes[i] = i;
++        return SimdConstant::CreateX16(bytes);
++      case SimdPermuteOp::PERMUTE_32x4: {
++        const int32_t* w = reinterpret_cast<const int32_t*>(s.control.bytes());
++        for (int i = 0; i < 4; i++)
++          for (int j = 0; j < 4; j++) bytes[i*4+j] = w[i]*4+j;
++        return SimdConstant::CreateX16(bytes);
++      }
++      case SimdPermuteOp::PERMUTE_16x8: {
++        const int16_t* h = reinterpret_cast<const int16_t*>(s.control.bytes());
++        for (int i = 0; i < 8; i++) {
++          int idx = h[i] & 0x7;
++          bytes[i*2] = idx*2;
++          bytes[i*2+1] = idx*2+1;
++        }
++        return SimdConstant::CreateX16(bytes);
++      }
++      case SimdPermuteOp::BROADCAST_8x16: {
++        int8_t lane = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
++        for (int i = 0; i < 16; i++) bytes[i] = lane;
++        return SimdConstant::CreateX16(bytes);
++      }
++      case SimdPermuteOp::BROADCAST_16x8: {
++        int8_t lane = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
++        for (int i = 0; i < 8; i++) {
++          bytes[i*2] = lane*2; bytes[i*2+1] = lane*2+1;
++        }
++        return SimdConstant::CreateX16(bytes);
++      }
++      case SimdPermuteOp::ROTATE_RIGHT_8x16: {
++        uint8_t shift = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
++        for (int i = 0; i < 16; i++) bytes[i] = (i + shift) % 16;
++        return SimdConstant::CreateX16(bytes);
++      }
++      case SimdPermuteOp::SHIFT_RIGHT_8x16: {
++        uint8_t shift = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
++        for (int i = 0; i < 16; i++) bytes[i] = (i+shift < 16) ? (i+shift) : 0;
++        return SimdConstant::CreateX16(bytes);
++      }
++      case SimdPermuteOp::SHIFT_LEFT_8x16: {
++        uint8_t shift = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
++        for (int i = 0; i < 16; i++) bytes[i] = (i >= shift) ? (i-shift) : 0;
++        return SimdConstant::CreateX16(bytes);
++      }
++      case SimdPermuteOp::REVERSE_16x8:
++        // Reverse bytes within each 16-bit lane: [1,0,3,2,5,4,...]
++        for (int i = 0; i < 8; i++) {
++          bytes[i*2] = i*2+1; bytes[i*2+1] = i*2;
++        }
++        return SimdConstant::CreateX16(bytes);
++      case SimdPermuteOp::REVERSE_32x4:
++        // Reverse bytes within each 32-bit lane: [3,2,1,0,7,6,5,4,...]
++        for (int i = 0; i < 4; i++)
++          for (int j = 0; j < 4; j++) bytes[i*4+j] = i*4+(3-j);
++        return SimdConstant::CreateX16(bytes);
++      case SimdPermuteOp::REVERSE_64x2:
++        // Reverse bytes within each 64-bit lane: [7,6,5,4,3,2,1,0,15,...]
++        for (int i = 0; i < 2; i++)
++          for (int j = 0; j < 8; j++) bytes[i*8+j] = i*8+(7-j);
++        return SimdConstant::CreateX16(bytes);
++      default:
++        break;
++    }
++  }
++  // Handle SimdShuffleOp (two-operand patterns).
++  if (s.shuffleOp) {
++    switch (*s.shuffleOp) {
++      case SimdShuffleOp::CONCAT_RIGHT_SHIFT_8x16: {
++        // control[0] = suffix length. ARM64 uses 16-count as the EXT shift.
++        // Reconstruct raw byte indices: EXT(rhs, lhs, 16-count) =
++        // take (16-count) bytes from rhs end, then count bytes from lhs start.
++        uint8_t count = 16 - reinterpret_cast<const int8_t*>(s.control.bytes())[0];
++        for (int i = 0; i < 16; i++) {
++          int idx = i + count;
++          bytes[i] = (idx < 16) ? (idx + 16) : (idx - 16);
++        }
++        return SimdConstant::CreateX16(bytes);
++      }
++      case SimdShuffleOp::BLEND_8x16: {
++        // control has 0 (lhs) or -1 (rhs) per byte.
++        const int8_t* mask = reinterpret_cast<const int8_t*>(s.control.bytes());
++        for (int i = 0; i < 16; i++)
++          bytes[i] = mask[i] ? (i + 16) : i;
++        return SimdConstant::CreateX16(bytes);
++      }
++      case SimdShuffleOp::BLEND_16x8: {
++        const int16_t* mask = reinterpret_cast<const int16_t*>(s.control.bytes());
++        for (int i = 0; i < 8; i++) {
++          int base = mask[i] ? (i * 2 + 16) : (i * 2);
++          bytes[i * 2] = base;
++          bytes[i * 2 + 1] = base + 1;
++        }
++        return SimdConstant::CreateX16(bytes);
++      }
++#define INTERLEAVE(name, width, low_start, count) \
++      case SimdShuffleOp::name: { \
++        for (int i = 0; i < count; i++) { \
++          int lhsIdx = low_start + i * width; \
++          int rhsIdx = lhsIdx + 16; \
++          for (int j = 0; j < width; j++) { \
++            bytes[(i * 2) * width + j] = lhsIdx + j; \
++            bytes[(i * 2 + 1) * width + j] = rhsIdx + j; \
++          } \
++        } \
++        return SimdConstant::CreateX16(bytes); \
++      }
++      INTERLEAVE(INTERLEAVE_LOW_8x16, 1, 0, 8)
++      INTERLEAVE(INTERLEAVE_HIGH_8x16, 1, 8, 8)
++      INTERLEAVE(INTERLEAVE_LOW_16x8, 2, 0, 4)
++      INTERLEAVE(INTERLEAVE_HIGH_16x8, 2, 8, 4)
++      INTERLEAVE(INTERLEAVE_LOW_32x4, 4, 0, 2)
++      INTERLEAVE(INTERLEAVE_HIGH_32x4, 4, 8, 2)
++      INTERLEAVE(INTERLEAVE_LOW_64x2, 8, 0, 1)
++      INTERLEAVE(INTERLEAVE_HIGH_64x2, 8, 8, 1)
++#undef INTERLEAVE
++      default:
++        break;
++    }
++  }
++  // PERMUTE_8x16, SHUFFLE_BLEND_8x16, etc: control should have raw byte indices.
++  // Force to Int8x16 type to avoid assertions from mismatched types.
++  if (s.control.type() == SimdConstant::Int8x16) {
++    return s.control;
++  }
++  // Fallback: re-create as Int8x16 from raw bytes.
++  memcpy(bytes, s.control.bytes(), 16);
++  return SimdConstant::CreateX16(bytes);
++}
++
++#endif  // ENABLE_WASM_SIMD
++
++void LIRGenerator::visitWasmShuffleSimd128(MWasmShuffleSimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++  MOZ_ASSERT(ins->type() == MIRType::Simd128);
++  SimdShuffle s = ins->shuffle();
++  switch (s.opd) {
++    case SimdShuffle::Operand::LEFT:
++    case SimdShuffle::Operand::RIGHT: {
++      // Single-operand permute: the analysis has identified that only one
++      // input matters (the other is zero or unused).
++      LAllocation src;
++      if (s.opd == SimdShuffle::Operand::LEFT) {
++        src = useRegisterAtStart(ins->lhs());
++      } else {
++        src = useRegisterAtStart(ins->rhs());
++      }
++      auto* lir =
++          new (alloc()) LWasmPermuteSimd128(src, *s.permuteOp, s.control);
++      define(lir, ins);
++      break;
++    }
++    case SimdShuffle::Operand::BOTH:
++    case SimdShuffle::Operand::BOTH_SWAPPED: {
++      SimdConstant ctrl = ReconstructShuffleBytes(s);
++      LAllocation lhs, rhs;
++      if (s.opd == SimdShuffle::Operand::BOTH_SWAPPED) {
++        lhs = useRegisterAtStart(ins->rhs());
++        rhs = useRegisterAtStart(ins->lhs());
++      } else {
++        lhs = useRegisterAtStart(ins->lhs());
++        rhs = useRegisterAtStart(ins->rhs());
++      }
++      auto* lir = new (alloc()) LWasmShuffleSimd128(
++          lhs, rhs, *s.shuffleOp, ctrl);
++      define(lir, ins);
++      break;
++    }
++  }
++#else
++  MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmReplaceLaneSimd128(MWasmReplaceLaneSimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++  MOZ_ASSERT(ins->type() == MIRType::Simd128);
++  if (ins->rhs()->type() == MIRType::Int64) {
++    auto* lir = new (alloc()) LWasmReplaceInt64LaneSimd128(
++        useRegisterAtStart(ins->lhs()), useInt64Register(ins->rhs()));
++    defineReuseInput(lir, ins, LWasmReplaceInt64LaneSimd128::LhsIndex);
++  } else {
++    auto* lir = new (alloc()) LWasmReplaceLaneSimd128(
++        useRegisterAtStart(ins->lhs()), useRegister(ins->rhs()));
++    defineReuseInput(lir, ins, LWasmReplaceLaneSimd128::LhsIndex);
++  }
++#else
++  MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmScalarToSimd128(MWasmScalarToSimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++  MOZ_ASSERT(ins->type() == MIRType::Simd128);
++  if (ins->input()->type() == MIRType::Int64) {
++    auto* lir =
++        new (alloc()) LWasmInt64ToSimd128(useInt64RegisterAtStart(ins->input()));
++    define(lir, ins);
++  } else {
++    auto* lir =
++        new (alloc()) LWasmScalarToSimd128(useRegisterAtStart(ins->input()));
++    define(lir, ins);
++  }
++#else
++  MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++  MOZ_ASSERT(ins->type() == MIRType::Simd128);
++  auto* lir = new (alloc())
++      LWasmUnarySimd128(useRegisterAtStart(ins->input()),
++                        LDefinition::BogusTemp());
++  define(lir, ins);
++#else
++  MOZ_CRASH("No SIMD");
++#endif
++}
++#ifdef ENABLE_WASM_SIMD
++bool LIRGeneratorPPC64::canFoldReduceSimd128AndBranch(wasm::SimdOp op) {
++  switch (op) {
++    case wasm::SimdOp::V128AnyTrue:
++    case wasm::SimdOp::I8x16AllTrue:
++    case wasm::SimdOp::I16x8AllTrue:
++    case wasm::SimdOp::I32x4AllTrue:
++    case wasm::SimdOp::I64x2AllTrue:
++      return true;
++    default:
++      return false;
++  }
++}
++
++bool LIRGeneratorPPC64::canEmitWasmReduceSimd128AtUses(
++    MWasmReduceSimd128* ins) {
++  if (!ins->canEmitAtUses()) {
++    return false;
++  }
++  if (ins->type() != MIRType::Int32) {
++    return false;
++  }
++  if (!canFoldReduceSimd128AndBranch(ins->simdOp())) {
++    return false;
++  }
++  MUseIterator iter(ins->usesBegin());
++  if (iter == ins->usesEnd()) {
++    return true;
++  }
++  MNode* node = iter->consumer();
++  if (!node->isDefinition() || !node->toDefinition()->isTest()) {
++    return false;
++  }
++  iter++;
++  return iter == ins->usesEnd();
++}
++#endif  // ENABLE_WASM_SIMD
++
++void LIRGenerator::visitWasmReduceSimd128(MWasmReduceSimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++  if (canEmitWasmReduceSimd128AtUses(ins)) {
++    emitAtUses(ins);
++    return;
++  }
++  if (ins->type() == MIRType::Int64) {
++    auto* lir = new (alloc())
++        LWasmReduceSimd128ToInt64(useRegisterAtStart(ins->input()));
++    defineInt64(lir, ins);
++  } else {
++    auto* lir =
++        new (alloc()) LWasmReduceSimd128(useRegisterAtStart(ins->input()));
++    define(lir, ins);
++  }
++#else
++  MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmLoadLaneSimd128(MWasmLoadLaneSimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++  LUse base = useRegisterAtStart(ins->base());
++  LUse inputUse = useRegisterAtStart(ins->value());
++  LAllocation memoryBase =
++      ins->hasMemoryBase() ? LAllocation(useRegisterAtStart(ins->memoryBase()))
++                           : LGeneralReg(HeapReg);
++  auto* lir = new (alloc()) LWasmLoadLaneSimd128(base, inputUse, memoryBase);
++  define(lir, ins);
++#else
++  MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmStoreLaneSimd128(MWasmStoreLaneSimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++  LUse base = useRegisterAtStart(ins->base());
++  LUse input = useRegisterAtStart(ins->value());
++  LAllocation memoryBase =
++      ins->hasMemoryBase() ? LAllocation(useRegisterAtStart(ins->memoryBase()))
++                           : LGeneralReg(HeapReg);
++  auto* lir = new (alloc()) LWasmStoreLaneSimd128(base, input, memoryBase);
++  add(lir, ins);
++#else
++  MOZ_CRASH("No SIMD");
++#endif
++}
++
++// PPC64 specializes compare+select for {U,}Int32 / {U,}Int64 compare with
++// Int32 / Int64 result. The CodeGen visitor
++// (CodeGenerator-ppc64.cpp:visitWasmCompareAndSelect) emits
++// cmpw/cmplw/cmpd/cmpld + isel = 2 insns, replacing the ~5-7 insns the
++// generic path would emit (boolean materialization + test + isel). FP
++// specialization is not worthwhile — the generic FP select path already
++// runs faster than the specialized integer one and PPC64 lacks a true
++// fcsel equivalent (fsel only compares against zero).
++bool LIRGeneratorShared::canSpecializeWasmCompareAndSelect(
++    MCompare::CompareType compTy, MIRType insTy) {
++  const bool insOk = insTy == MIRType::Int32 || insTy == MIRType::Int64;
++  const bool cmpOk = compTy == MCompare::Compare_Int32 ||
++                     compTy == MCompare::Compare_UInt32 ||
++                     compTy == MCompare::Compare_Int64 ||
++                     compTy == MCompare::Compare_UInt64;
++  return insOk && cmpOk;
++}
++
++void LIRGeneratorShared::lowerWasmCompareAndSelect(MWasmSelect* ins,
++                                                   MDefinition* lhs,
++                                                   MDefinition* rhs,
++                                                   MCompare::CompareType compTy,
++                                                   JSOp jsop) {
++  MOZ_ASSERT(canSpecializeWasmCompareAndSelect(compTy, ins->type()));
++  auto* lir = new (alloc()) LWasmCompareAndSelect(
++      useRegister(lhs), useRegister(rhs), useRegisterAtStart(ins->trueExpr()),
++      useRegister(ins->falseExpr()), compTy, jsop);
++  defineReuseInput(lir, ins, LWasmCompareAndSelect::IfTrueExprIndex);
++}
++
++// MIR helpers needed by the linker
++#ifdef ENABLE_WASM_SIMD
++bool MWasmTernarySimd128::specializeBitselectConstantMaskAsShuffle(
++    int8_t shuffle[16]) {
++  return false;
++}
++#endif
++
++bool MWasmBinarySimd128::specializeForConstantRhs() { return false; }
++
++#ifdef ENABLE_WASM_SIMD
++bool MWasmTernarySimd128::canRelaxBitselect() { return false; }
++#endif
++
++#ifdef ENABLE_WASM_SIMD
++bool MWasmBinarySimd128::canPmaddubsw() { return false; }
++#endif
++
++}  // namespace jit
++}  // namespace js
+diff --git a/js/src/jit/ppc64/Lowering-ppc64.h b/js/src/jit/ppc64/Lowering-ppc64.h
+new file mode 100644
+index 000000000000..9c3519a7bb23
+--- /dev/null
++++ b/js/src/jit/ppc64/Lowering-ppc64.h
+@@ -0,0 +1,105 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_Lowering_ppc64_h
++#define jit_ppc64_Lowering_ppc64_h
++
++#include "jit/shared/Lowering-shared.h"
++
++namespace js {
++namespace jit {
++
++class LIRGeneratorPPC64 : public LIRGeneratorShared {
++ protected:
++  LIRGeneratorPPC64(MIRGenerator* gen, MIRGraph& graph, LIRGraph& lirGraph)
++      : LIRGeneratorShared(gen, graph, lirGraph) {}
++
++  LTableSwitch* newLTableSwitch(const LAllocation& in,
++                                const LDefinition& inputCopy);
++  LTableSwitchV* newLTableSwitchV(const LBoxAllocation& in);
++
++  void lowerForShift(LInstructionHelper<1, 2, 0>* ins, MDefinition* mir,
++                     MDefinition* lhs, MDefinition* rhs);
++  template <class LInstr>
++  void lowerForShiftInt64(LInstr* ins, MDefinition* mir, MDefinition* lhs,
++                          MDefinition* rhs);
++  void lowerForALU(LInstructionHelper<1, 1, 0>* ins, MDefinition* mir,
++                   MDefinition* input);
++  void lowerForALU(LInstructionHelper<1, 2, 0>* ins, MDefinition* mir,
++                   MDefinition* lhs, MDefinition* rhs);
++  void lowerForALUInt64(LInstructionHelper<INT64_PIECES, INT64_PIECES, 0>* ins,
++                        MDefinition* mir, MDefinition* input);
++  void lowerForALUInt64(
++      LInstructionHelper<INT64_PIECES, 2 * INT64_PIECES, 0>* ins,
++      MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
++  void lowerForMulInt64(LMulI64* ins, MMul* mir, MDefinition* lhs,
++                        MDefinition* rhs);
++  void lowerForFPU(LInstructionHelper<1, 1, 0>* ins, MDefinition* mir,
++                   MDefinition* input);
++  void lowerForFPU(LInstructionHelper<1, 2, 0>* ins, MDefinition* mir,
++                   MDefinition* lhs, MDefinition* rhs);
++
++  template <size_t Temps>
++  void lowerForCompareI64(LInstructionHelper<1, 2 * INT64_PIECES, Temps>* lir,
++                          MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
++
++  LBoxAllocation useBoxFixed(MDefinition* mir, Register reg1, Register reg2,
++                             bool useAtStart = false);
++
++  LAllocation useByteOpRegister(MDefinition* mir);
++  LAllocation useByteOpRegisterAtStart(MDefinition* mir);
++  LAllocation useByteOpRegisterOrNonDoubleConstant(MDefinition* mir);
++  LDefinition tempByteOpRegister();
++
++  LDefinition tempToUnbox();
++
++  bool needTempForPostBarrier() { return true; }
++
++  void lowerUntypedPhiInput(MPhi* phi, uint32_t inputPosition, LBlock* block,
++                            size_t lirIndex);
++  void lowerInt64PhiInput(MPhi* phi, uint32_t inputPosition, LBlock* block,
++                          size_t lirIndex);
++  void defineInt64Phi(MPhi* phi, size_t lirIndex);
++
++  void lowerMulI(MMul* mul, MDefinition* lhs, MDefinition* rhs);
++  void lowerDivI(MDiv* div);
++  void lowerDivI64(MDiv* div);
++  void lowerModI(MMod* mod);
++  void lowerModI64(MMod* mod);
++  void lowerUDiv(MDiv* div);
++  void lowerUDivI64(MDiv* div);
++  void lowerUMod(MMod* mod);
++  void lowerUModI64(MMod* mod);
++  void lowerUrshD(MUrsh* mir);
++  void lowerPowOfTwoI(MPow* mir);
++  void lowerBigIntPtrDiv(MBigIntPtrDiv* ins);
++  void lowerBigIntPtrMod(MBigIntPtrMod* ins);
++  void lowerBigIntPtrLsh(MBigIntPtrLsh* ins);
++  void lowerBigIntPtrRsh(MBigIntPtrRsh* ins);
++  void lowerTruncateDToInt32(MTruncateToInt32* ins);
++  void lowerTruncateFToInt32(MTruncateToInt32* ins);
++  void lowerBuiltinInt64ToFloatingPoint(MBuiltinInt64ToFloatingPoint* ins);
++  void lowerWasmSelectI(MWasmSelect* select);
++  void lowerWasmSelectI64(MWasmSelect* select);
++  void lowerWasmBuiltinTruncateToInt64(MWasmBuiltinTruncateToInt64* ins);
++  void lowerWasmBuiltinTruncateToInt32(MWasmBuiltinTruncateToInt32* ins);
++  void lowerWasmBuiltinDivI64(MWasmBuiltinDivI64* div);
++  void lowerWasmBuiltinModI64(MWasmBuiltinModI64* mod);
++  void lowerAtomicLoad64(MLoadUnboxedScalar* ins);
++  void lowerAtomicStore64(MStoreUnboxedScalar* ins);
++
++#ifdef ENABLE_WASM_SIMD
++  bool canFoldReduceSimd128AndBranch(wasm::SimdOp op);
++  bool canEmitWasmReduceSimd128AtUses(MWasmReduceSimd128* ins);
++#endif
++};
++
++typedef LIRGeneratorPPC64 LIRGeneratorSpecific;
++
++}  // namespace jit
++}  // namespace js
++
++#endif /* jit_ppc64_Lowering_ppc64_h */
+diff --git a/js/src/jit/ppc64/MacroAssembler-ppc64-inl.h b/js/src/jit/ppc64/MacroAssembler-ppc64-inl.h
+new file mode 100644
+index 000000000000..f82ca36b4e40
+--- /dev/null
++++ b/js/src/jit/ppc64/MacroAssembler-ppc64-inl.h
+@@ -0,0 +1,6142 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_MacroAssembler_ppc64_inl_h
++#define jit_ppc64_MacroAssembler_ppc64_inl_h
++
++#include "jit/ppc64/MacroAssembler-ppc64.h"
++
++namespace js {
++namespace jit {
++
++//{{{ check_macroassembler_style
++
++// ===============================================================
++// Move instructions
++
++void MacroAssembler::move64(Register64 src, Register64 dest) {
++  movePtr(src.reg, dest.reg);
++}
++
++void MacroAssembler::move64(Imm64 imm, Register64 dest) {
++  movePtr(ImmWord(imm.value), dest.reg);
++}
++
++void MacroAssembler::moveDoubleToGPR64(FloatRegister src, Register64 dest) {
++  as_mfvsrd(dest.reg, src);
++}
++
++void MacroAssembler::moveGPR64ToDouble(Register64 src, FloatRegister dest) {
++  as_mtvsrd(dest, src.reg);
++}
++
++void MacroAssembler::moveLowDoubleToGPR(FloatRegister src, Register dest) {
++  MOZ_CRASH("Not supported for this target");
++}
++
++void MacroAssembler::move64To32(Register64 src, Register dest) {
++  as_extsw(dest, src.reg);
++}
++
++void MacroAssembler::move32To64ZeroExtend(Register src, Register64 dest) {
++  // clrldi dest, src, 32 — clear upper 32 bits.
++  as_rldicl(dest.reg, src, 0, 32);
++}
++
++void MacroAssembler::move8To64SignExtend(Register src, Register64 dest) {
++  as_extsb(dest.reg, src);
++}
++
++void MacroAssembler::move16To64SignExtend(Register src, Register64 dest) {
++  as_extsh(dest.reg, src);
++}
++
++void MacroAssembler::move32To64SignExtend(Register src, Register64 dest) {
++  as_extsw(dest.reg, src);
++}
++
++void MacroAssembler::moveFloat32ToGPR(FloatRegister src, Register dest) {
++  // FPR holds double-format value (PPC convention). Convert to
++  // single-precision bits in bits 0:31 of the VSR, then extract.
++  as_xscvdpspn(ScratchDoubleReg, src);
++  as_mfvsrd(dest, ScratchDoubleReg);
++  x_srdi(dest, dest, 32);
++}
++
++void MacroAssembler::moveGPRToFloat32(Register src, FloatRegister dest) {
++  // Place raw single-precision bits in VSR bits 0:31, then convert
++  // to double-precision format (matching PPC's FPR convention, like lfs).
++  if (HasPOWER9()) {
++    // mtvsrws splats the 32-bit word to both halves of the VSR.
++    as_mtvsrws(dest, src);
++  } else {
++    // POWER8: shift GPR left 32 bits to place float bits in upper word,
++    // then move to VSR. xscvspdpn reads from bits 0:31.
++    UseScratchRegisterScope temps(*this);
++    Register tmp = temps.Acquire();
++    x_sldi(tmp, src, 32);
++    as_mtvsrd(dest, tmp);
++  }
++  as_xscvspdpn(dest, dest);
++}
++
++void MacroAssembler::moveFloat16ToGPR(FloatRegister src, Register dest) {
++  MOZ_ASSERT(HasPOWER9());
++  // src has FP16 in dw0 bits 48:63; rest of dw0 is 0 (per xscvdphp /
++  // lxsihzx / mtvsrwz contract). mfvsrd reads dw0 → dest = 0x...0000_HHHH.
++  // Mask defensively in case a future caller hands us a non-canonical FP16.
++  as_mfvsrd(dest, src);
++  as_rldicl(dest, dest, 0, 48);  // clrldi 48: keep low 16 bits
++}
++
++void MacroAssembler::moveGPRToFloat16(Register src, FloatRegister dest) {
++  MOZ_ASSERT(HasPOWER9());
++  // mtvsrwz zeros dw0 word 0 and copies src's low 32 to dw0 word 1; mask
++  // src to its low 16 first so dw0 bits 32:47 stay zero (canonical FP16).
++  UseScratchRegisterScope temps(*this);
++  Register scratch = temps.Acquire();
++  as_rldicl(scratch, src, 0, 48);  // clrldi 48: keep only low 16
++  as_mtvsrwz(dest, scratch);
++}
++
++void MacroAssembler::move8ZeroExtend(Register src, Register dest) {
++  // rlwinm dest, src, 0, 24, 31 — mask to low 8 bits.
++  as_rlwinm(dest, src, 0, 24, 31);
++}
++
++void MacroAssembler::move8SignExtend(Register src, Register dest) {
++  as_extsb(dest, src);
++}
++
++void MacroAssembler::move16SignExtend(Register src, Register dest) {
++  as_extsh(dest, src);
++}
++
++void MacroAssembler::move8SignExtendToPtr(Register src, Register dest) {
++  as_extsb(dest, src);
++}
++
++void MacroAssembler::move16SignExtendToPtr(Register src, Register dest) {
++  as_extsh(dest, src);
++}
++
++void MacroAssembler::move32SignExtendToPtr(Register src, Register dest) {
++  as_extsw(dest, src);
++}
++
++void MacroAssembler::move32ZeroExtendToPtr(Register src, Register dest) {
++  as_rldicl(dest, src, 0, 32);
++}
++
++// ===============================================================
++// Load instructions
++
++void MacroAssembler::load32SignExtendToPtr(const Address& src, Register dest) {
++  load32(src, dest);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::loadAbiReturnAddress(Register dest) { xs_mflr(dest); }
++
++// ===============================================================
++// Logical instructions
++
++void MacroAssembler::not32(Register reg) {
++  x_not(reg, reg);
++  as_extsw(reg, reg);
++}
++
++void MacroAssembler::notPtr(Register reg) { x_not(reg, reg); }
++
++void MacroAssembler::andPtr(Register src, Register dest) {
++  as_and_(dest, dest, src);
++}
++
++// If `mask` is a non-zero, non-all-ones contiguous run of 1-bits in a
++// 32-bit value (LSB-numbering), set MB/ME to the BE bit positions
++// (PPC convention: bit 0 = MSB) needed by `rlwinm SH=0` and return true.
++// Otherwise return false. Run-time cost is at JIT emit time only.
++static inline bool IsContigMask32(uint32_t mask, unsigned& mb, unsigned& me) {
++  if (mask == 0 || mask == 0xFFFFFFFFu) return false;
++  unsigned tz = (unsigned)__builtin_ctz(mask);
++  uint32_t shifted = mask >> tz;
++  if ((shifted & (shifted + 1)) != 0) return false;  // Has a 0 between 1s.
++  unsigned width = 32 - (unsigned)__builtin_clz(shifted);
++  // LSB bits set: [tz, tz+width-1]. BE bits: [32-tz-width, 31-tz].
++  mb = 32 - tz - width;
++  me = 31 - tz;
++  return true;
++}
++
++// 64-bit contiguous-mask classification for AND-with-imm via PPC's
++// rotate-and-mask family (SH=0). On success, sets `lsb` (LSB-numbering
++// of lowest set bit) and `width` (number of contiguous 1-bits).
++// Caller picks the encoding:
++//   - lsb == 0:                low `width` bits set        → rldicl
++//   (mb6=64-width)
++//   - lsb + width == 64:       high `width` bits set       → rldicr
++//   (me6=width-1)
++//   - lsb + width <= 32:       contig mask within low 32   → rlwinm (zeros high
++//   32)
++//   - otherwise (mid-run mask straddling bit 32 with lsb>0): no SH=0 single
++//     insn fits, return false to fall back to scratch+and.
++static inline bool IsContigMask64(uint64_t mask, unsigned& lsb,
++                                  unsigned& width) {
++  if (mask == 0 || mask == ~uint64_t(0)) return false;
++  unsigned tz = (unsigned)__builtin_ctzll(mask);
++  uint64_t shifted = mask >> tz;
++  if ((shifted & (shifted + 1)) != 0) return false;  // Has a 0 between 1s.
++  width = 64 - (unsigned)__builtin_clzll(shifted);
++  lsb = tz;
++  return true;
++}
++
++void MacroAssembler::andPtr(Imm32 imm, Register dest) {
++  // andi. handles 16-bit unsigned immediates in 1 insn (sets CR0).
++  // For wider positive immediates, IsContigMask32 → rlwinm (1 insn,
++  // also sets CR0). NOTE: andPtr sign-extends Imm32 to 64-bit before
++  // ANDing, so contig-mask is only safe when the immediate is
++  // non-negative (high bit clear) — rlwinm always zeros the high 32.
++  uint32_t uimm = uint32_t(imm.value);
++  if (is_uintN(uimm, 16)) {
++    as_andi_rc(dest, dest, uimm);
++    return;
++  }
++  unsigned mb, me;
++  if (imm.value >= 0 && IsContigMask32(uimm, mb, me)) {
++    as_rlwinm_rc(dest, dest, 0, mb, me);
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord(uintptr_t(intptr_t(imm.value))), scratch);
++  as_and_(dest, dest, scratch);
++}
++
++void MacroAssembler::andPtr(Imm32 imm, Register src, Register dest) {
++  if (src != dest) {
++    xs_mr(dest, src);
++  }
++  andPtr(imm, dest);
++}
++
++void MacroAssembler::and64(Imm64 imm, Register64 dest) {
++  uint64_t u = imm.value;
++  // 16-bit unsigned → andi. (1 insn).
++  if (u <= 0xFFFFu) {
++    as_andi_rc(dest.reg, dest.reg, uint16_t(u));
++    return;
++  }
++  unsigned lsb, width;
++  if (IsContigMask64(u, lsb, width)) {
++    if (lsb == 0) {
++      // low `width` bits set: rldicl SH=0 MB=64-width.
++      as_rldicl_rc(dest.reg, dest.reg, 0, 64 - width);
++      return;
++    }
++    if (lsb + width == 64) {
++      // high `width` bits set: rldicr SH=0 ME=width-1.
++      as_rldicr_rc(dest.reg, dest.reg, 0, width - 1);
++      return;
++    }
++    if (lsb + width <= 32) {
++      // contig mask within low 32: rlwinm SH=0 zeros bits 0..31 too.
++      // BE positions: mb = 32 - lsb - width, me = 31 - lsb.
++      as_rlwinm_rc(dest.reg, dest.reg, 0, 32 - lsb - width, 31 - lsb);
++      return;
++    }
++    // mid-run mask straddling bit 32 (lsb>0, lsb+width>32, lsb+width<64):
++    // not encodable as SH=0 mask. Fall through to scratch+and.
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord(u), scratch);
++  as_and_(dest.reg, dest.reg, scratch);
++}
++
++void MacroAssembler::and64(Register64 src, Register64 dest) {
++  as_and_(dest.reg, dest.reg, src.reg);
++}
++
++void MacroAssembler::and32(Register src, Register dest) {
++  as_and_(dest, dest, src);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::and32(Imm32 imm, Register dest) {
++  uint32_t uimm = uint32_t(imm.value);
++  if (is_uintN(uimm, 16)) {
++    as_andi_rc(dest, dest, uimm);
++  } else {
++    unsigned mb, me;
++    if (IsContigMask32(uimm, mb, me)) {
++      // rlwinm.SH=0 ANDs with the contiguous mask; record form sets CR0
++      // to match the side-effect of the andi. fast path above.
++      as_rlwinm_rc(dest, dest, 0, mb, me);
++    } else {
++      UseScratchRegisterScope temps(asMasm());
++      Register scratch = temps.Acquire();
++      move32(imm, scratch);
++      as_and_(dest, dest, scratch);
++    }
++  }
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::and32(Imm32 imm, Register src, Register dest) {
++  if (src != dest) {
++    xs_mr(dest, src);
++  }
++  and32(imm, dest);
++}
++
++void MacroAssembler::and32(Imm32 imm, const Address& dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(dest, scratch);
++  and32(imm, scratch);
++  store32(scratch, dest);
++}
++
++void MacroAssembler::and32(const Address& src, Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(src, scratch);
++  as_and_(dest, dest, scratch);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::or64(Imm64 imm, Register64 dest) {
++  uint64_t u = imm.value;
++  // ori/oris zero-extend their immediates and don't touch other bits, so
++  // when imm fits in unsigned 32 (high 32 == 0) the pair handles it in
++  // 1-2 insns with no scratch.
++  if (u <= 0xFFFFFFFFu) {
++    uint16_t lo = uint16_t(u);
++    uint16_t hi = uint16_t(u >> 16);
++    if (hi == 0) {
++      as_ori(dest.reg, dest.reg, lo);
++    } else if (lo == 0) {
++      as_oris(dest.reg, dest.reg, hi);
++    } else {
++      as_ori(dest.reg, dest.reg, lo);
++      as_oris(dest.reg, dest.reg, hi);
++    }
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord(u), scratch);
++  as_or_(dest.reg, dest.reg, scratch);
++}
++
++void MacroAssembler::or32(Register src, Register dest) {
++  as_or_(dest, dest, src);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::or32(Imm32 imm, Register dest) {
++  uint32_t uimm = uint32_t(imm.value);
++  uint16_t lo = uimm & 0xFFFF;
++  uint16_t hi = (uimm >> 16) & 0xFFFF;
++  if (hi == 0) {
++    as_ori(dest, dest, lo);
++  } else if (lo == 0) {
++    as_oris(dest, dest, hi);
++  } else {
++    // ori + oris pair handles arbitrary 32-bit unsigned imm in 2 insns
++    // without a scratch GPR. ori/oris are non-record forms (don't touch
++    // CR0), matching the behavior of the previous scratch+or_ path
++    // (or_ is the record form, but the value-only result is what callers
++    // observe through dest).
++    as_ori(dest, dest, lo);
++    as_oris(dest, dest, hi);
++  }
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::or32(Imm32 imm, Register src, Register dest) {
++  if (src != dest) {
++    xs_mr(dest, src);
++  }
++  or32(imm, dest);
++}
++
++void MacroAssembler::or32(Imm32 imm, const Address& dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(dest, scratch);
++  or32(imm, scratch);
++  store32(scratch, dest);
++}
++
++void MacroAssembler::xor64(Imm64 imm, Register64 dest) {
++  uint64_t u = imm.value;
++  // xori/xoris zero-extend their immediates; for unsigned-32-fit values
++  // they replace the scratch+xor sequence with 1-2 insns.
++  if (u <= 0xFFFFFFFFu) {
++    uint16_t lo = uint16_t(u);
++    uint16_t hi = uint16_t(u >> 16);
++    if (hi == 0) {
++      as_xori(dest.reg, dest.reg, lo);
++    } else if (lo == 0) {
++      as_xoris(dest.reg, dest.reg, hi);
++    } else {
++      as_xori(dest.reg, dest.reg, lo);
++      as_xoris(dest.reg, dest.reg, hi);
++    }
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord(u), scratch);
++  as_xor_(dest.reg, dest.reg, scratch);
++}
++
++void MacroAssembler::orPtr(Register src, Register dest) {
++  as_or_(dest, dest, src);
++}
++
++void MacroAssembler::orPtr(Imm32 imm, Register dest) {
++  uint32_t uimm = uint32_t(imm.value);
++  uint16_t lo = uimm & 0xFFFF;
++  uint16_t hi = (uimm >> 16) & 0xFFFF;
++  // ori/oris zero-extend their immediates, so for non-negative Imm32 (high
++  // 32 of sign-extended value = 0) we can use ori+oris to OR the full
++  // 32-bit pattern in 1-2 insns. Negative Imm32 sign-extends to set high
++  // bits 32..63 in the OR — those bits would be lost with ori+oris alone.
++  if (imm.value >= 0) {
++    if (hi == 0) {
++      as_ori(dest, dest, lo);
++    } else if (lo == 0) {
++      as_oris(dest, dest, hi);
++    } else {
++      as_ori(dest, dest, lo);
++      as_oris(dest, dest, hi);
++    }
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord(uintptr_t(intptr_t(imm.value))), scratch);
++  as_or_(dest, dest, scratch);
++}
++
++void MacroAssembler::orPtr(Imm32 imm, Register src, Register dest) {
++  if (src != dest) {
++    xs_mr(dest, src);
++  }
++  orPtr(imm, dest);
++}
++
++void MacroAssembler::or64(Register64 src, Register64 dest) {
++  as_or_(dest.reg, dest.reg, src.reg);
++}
++
++void MacroAssembler::xor64(Register64 src, Register64 dest) {
++  as_xor_(dest.reg, dest.reg, src.reg);
++}
++
++void MacroAssembler::xorPtr(Register src, Register dest) {
++  as_xor_(dest, dest, src);
++}
++
++void MacroAssembler::xorPtr(Imm32 imm, Register dest) {
++  uint32_t uimm = uint32_t(imm.value);
++  uint16_t lo = uimm & 0xFFFF;
++  uint16_t hi = (uimm >> 16) & 0xFFFF;
++  if (imm.value >= 0) {
++    if (hi == 0) {
++      as_xori(dest, dest, lo);
++    } else if (lo == 0) {
++      as_xoris(dest, dest, hi);
++    } else {
++      as_xori(dest, dest, lo);
++      as_xoris(dest, dest, hi);
++    }
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord(uintptr_t(intptr_t(imm.value))), scratch);
++  as_xor_(dest, dest, scratch);
++}
++
++void MacroAssembler::xorPtr(Imm32 imm, Register src, Register dest) {
++  if (src != dest) {
++    xs_mr(dest, src);
++  }
++  xorPtr(imm, dest);
++}
++
++void MacroAssembler::xor32(Register src, Register dest) {
++  as_xor_(dest, dest, src);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::xor32(Imm32 imm, Register dest) {
++  uint32_t uimm = uint32_t(imm.value);
++  uint16_t lo = uimm & 0xFFFF;
++  uint16_t hi = (uimm >> 16) & 0xFFFF;
++  if (hi == 0) {
++    as_xori(dest, dest, lo);
++  } else if (lo == 0) {
++    as_xoris(dest, dest, hi);
++  } else {
++    // xori + xoris pair — 2 insns, no scratch GPR.
++    as_xori(dest, dest, lo);
++    as_xoris(dest, dest, hi);
++  }
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::xor32(Imm32 imm, Register src, Register dest) {
++  if (src != dest) {
++    xs_mr(dest, src);
++  }
++  xor32(imm, dest);
++}
++
++void MacroAssembler::xor32(Imm32 imm, const Address& dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(dest, scratch);
++  xor32(imm, scratch);
++  store32(scratch, dest);
++}
++
++void MacroAssembler::xor32(const Address& src, Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(src, scratch);
++  xor32(scratch, dest);
++}
++
++// ===============================================================
++// Swap instructions
++
++void MacroAssembler::byteSwap16SignExtend(Register reg) {
++  if (HasPOWER10()) {
++    // brh byte-reverses every halfword in reg; extsh keeps just the
++    // low halfword's byte-reversed value, sign-extended to 64 bits.
++    as_brh(reg, reg);
++    as_extsh(reg, reg);
++    return;
++  }
++  // POWER8/9: rotate-and-mask synthesis. Swap bytes in low halfword via
++  // (reg<<8)&0xFF00 | (reg>>8)&0xFF, then sign-extend.
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  as_rlwinm(scratch, reg, 8, 16, 23);  // scratch = (reg<<8) & 0xFF00
++  as_rlwinm(reg, reg, 24, 24, 31);     // reg = (reg>>8) & 0xFF
++  as_or_(reg, reg, scratch);
++  as_extsh(reg, reg);
++}
++
++void MacroAssembler::byteSwap16ZeroExtend(Register reg) {
++  if (HasPOWER10()) {
++    // brh byte-reverses every halfword; rldicl with sh=0,mb=48 zeroes
++    // the upper 48 bits — no CR0 side effect (vs andi.).
++    as_brh(reg, reg);
++    as_rldicl(reg, reg, 0, 48);
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  // Both rlwinm forms zero-extend the 64-bit destination per ISA v3.0B
++  // (mask M = MASK(MB+32, ME+32) is 0 above bit 31), so after the OR the
++  // upper 48 bits are already zero — no follow-up clearing needed.
++  as_rlwinm(scratch, reg, 8, 16, 23);
++  as_rlwinm(reg, reg, 24, 24, 31);
++  as_or_(reg, reg, scratch);
++}
++
++void MacroAssembler::byteSwap32(Register reg) {
++  if (HasPOWER10()) {
++    // brw byte-reverses both 32-bit halves; extsw drops the upper half
++    // and sign-extends the byte-reversed low word to 64 bits.
++    as_brw(reg, reg);
++    as_extsw(reg, reg);
++    return;
++  }
++  // POWER8/9: rotate-with-insert synthesis (4 insns).
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  // scratch = rotate reg left 8, mask bytes 0,3
++  as_rlwinm(scratch, reg, 8, 0, 31);    // rotl32 by 8
++  as_rlwimi(scratch, reg, 24, 0, 7);    // insert byte 0
++  as_rlwimi(scratch, reg, 24, 16, 23);  // insert byte 2
++  // Sign-extend to 64 bits (as 32-bit value).
++  as_extsw(reg, scratch);
++}
++
++void MacroAssembler::byteSwap64(Register64 reg64) {
++  if (HasPOWER10()) {
++    // 1 insn, no FPR round-trip.
++    as_brd(reg64.reg, reg64.reg);
++  } else if (HasPOWER9()) {
++    as_mtvsrd(ScratchDoubleReg, reg64.reg);
++    as_xxbrd(ScratchDoubleReg, ScratchDoubleReg);
++    as_mfvsrd(reg64.reg, ScratchDoubleReg);
++  } else {
++    // POWER8: byte-swap via stack using stwbrx (word byte-reverse store).
++    // stwbrx RS,RA,RB stores RS byte-reversed at RA+RB.
++    // For 64-bit swap: store high word reversed at addr+0, low word at addr+4.
++    Register r = reg64.reg;
++    UseScratchRegisterScope temps(*this);
++    Register tmp = temps.Acquire();
++    as_stdu(StackPointer, StackPointer, -16);
++    // Store low 32 bits byte-reversed at SP+12.
++    as_addi(tmp, StackPointer, 12);
++    as_stwbrx(r, r0, tmp);  // r0 as RA = 0, so addr = tmp
++    // Store high 32 bits byte-reversed at SP+8.
++    x_srdi(r, r, 32);
++    as_addi(tmp, StackPointer, 8);
++    as_stwbrx(r, r0, tmp);  // addr = tmp
++    // Load reversed 64-bit value from SP+8.
++    as_ld(r, StackPointer, 8);
++    as_addi(StackPointer, StackPointer, 16);
++  }
++}
++
++// ===============================================================
++// Arithmetic functions
++
++void MacroAssembler::addPtr(Register src, Register dest) {
++  as_add(dest, dest, src);
++}
++
++void MacroAssembler::addPtr(Imm32 imm, Register dest) {
++  int32_t val = imm.value;
++  if (is_intN(val, 16)) {
++    as_addi(dest, dest, val);
++    return;
++  }
++  if (HasPOWER10()) {
++    // Imm32 always fits 34-bit signed; paddi does dest = dest + imm in one
++    // prefixed instruction with no scratch.
++    as_paddi(dest, dest, int64_t(val), /*R=*/false);
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord(int64_t(val)), scratch);
++  as_add(dest, dest, scratch);
++}
++
++void MacroAssembler::addPtr(ImmWord imm, Register dest) {
++  if (is_intN(int64_t(imm.value), 16)) {
++    as_addi(dest, dest, int16_t(imm.value));
++    return;
++  }
++  if (HasPOWER10() && is_intN((intptr_t)imm.value, 34)) {
++    as_paddi(dest, dest, (int64_t)imm.value, /*R=*/false);
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(imm, scratch);
++  as_add(dest, dest, scratch);
++}
++
++void MacroAssembler::add64(Register64 src, Register64 dest) {
++  as_add(dest.reg, dest.reg, src.reg);
++}
++
++void MacroAssembler::add64(Imm32 imm, Register64 dest) {
++  addPtr(Imm32(imm.value), dest.reg);
++}
++
++void MacroAssembler::add64(Imm64 imm, Register64 dest) {
++  if (is_intN(int64_t(imm.value), 16)) {
++    as_addi(dest.reg, dest.reg, int16_t(imm.value));
++    return;
++  }
++  if (HasPOWER10() && is_intN((int64_t)imm.value, 34)) {
++    as_paddi(dest.reg, dest.reg, (int64_t)imm.value, /*R=*/false);
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  MOZ_ASSERT(dest.reg != scratch);
++  movePtr(ImmWord(imm.value), scratch);
++  as_add(dest.reg, dest.reg, scratch);
++}
++
++void MacroAssembler::add32(Register src, Register dest) {
++  as_add(dest, dest, src);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::add32(Imm32 imm, Register dest) {
++  if (is_intN(imm.value, 16)) {
++    as_addi(dest, dest, imm.value);
++  } else {
++    UseScratchRegisterScope temps(asMasm());
++    Register scratch = temps.Acquire();
++    move32(imm, scratch);
++    as_add(dest, dest, scratch);
++  }
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::add32(Imm32 imm, Register src, Register dest) {
++  move32(src, dest);
++  add32(imm, dest);
++}
++
++void MacroAssembler::add32(Imm32 imm, const Address& dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(dest, scratch);
++  add32(imm, scratch);
++  store32(scratch, dest);
++}
++
++void MacroAssembler::add32(const Address& src, Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(src, scratch);
++  as_add(dest, dest, scratch);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::addPtr(Imm32 imm, const Address& dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(dest, scratch);
++  addPtr(imm, scratch);
++  storePtr(scratch, dest);
++}
++
++void MacroAssembler::addPtr(const Address& src, Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(src, scratch);
++  addPtr(scratch, dest);
++}
++
++void MacroAssembler::addDouble(FloatRegister src, FloatRegister dest) {
++  as_fadd(dest, dest, src);
++}
++
++void MacroAssembler::addFloat32(FloatRegister src, FloatRegister dest) {
++  as_fadds(dest, dest, src);
++}
++
++CodeOffset MacroAssembler::sub32FromStackPtrWithPatch(Register dest) {
++  CodeOffset offset = CodeOffset(currentOffset());
++  emitLoad64Stanza(dest, 0);
++  as_subf(dest, dest, StackPointer);
++  return offset;
++}
++
++void MacroAssembler::patchSub32FromStackPtr(CodeOffset offset, Imm32 imm) {
++  Instruction* inst = (Instruction*)editSrc(BufferOffset(offset.offset()));
++  UpdateLoad64Value(inst, uint64_t(int64_t(imm.value)));
++}
++
++void MacroAssembler::subPtr(Register src, Register dest) {
++  as_subf(dest, src, dest);
++}
++
++void MacroAssembler::subPtr(Imm32 imm, Register dest) {
++  if (is_intN(-int64_t(imm.value), 16)) {
++    as_addi(dest, dest, -imm.value);
++    return;
++  }
++  if (HasPOWER10()) {
++    // -Imm32 fits 34-bit signed (worst case -INT32_MIN = +2^31, well within
++    // ±2^33). paddi with the negated immediate does the subtract in 1 prefixed
++    // insn with no scratch.
++    as_paddi(dest, dest, -int64_t(imm.value), /*R=*/false);
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord(int64_t(imm.value)), scratch);
++  as_subf(dest, scratch, dest);
++}
++
++void MacroAssembler::sub64(Register64 src, Register64 dest) {
++  as_subf(dest.reg, src.reg, dest.reg);
++}
++
++void MacroAssembler::sub64(Imm64 imm, Register64 dest) {
++  if (is_intN(-int64_t(imm.value), 16)) {
++    as_addi(dest.reg, dest.reg, int16_t(-int64_t(imm.value)));
++    return;
++  }
++  if (HasPOWER10() && is_intN(-(int64_t)imm.value, 34)) {
++    as_paddi(dest.reg, dest.reg, -(int64_t)imm.value, /*R=*/false);
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  MOZ_ASSERT(dest.reg != scratch);
++  movePtr(ImmWord(imm.value), scratch);
++  as_subf(dest.reg, scratch, dest.reg);
++}
++
++void MacroAssembler::sub32(Register src, Register dest) {
++  as_subf(dest, src, dest);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::sub32(Imm32 imm, Register dest) {
++  if (is_intN(-int64_t(imm.value), 16)) {
++    as_addi(dest, dest, -imm.value);
++  } else {
++    UseScratchRegisterScope temps(asMasm());
++    Register scratch = temps.Acquire();
++    move32(imm, scratch);
++    as_subf(dest, scratch, dest);
++  }
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::sub32(const Address& src, Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(src, scratch);
++  as_subf(dest, scratch, dest);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::subPtr(Register src, const Address& dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(dest, scratch);
++  as_subf(scratch, src, scratch);
++  storePtr(scratch, dest);
++}
++
++void MacroAssembler::subPtr(const Address& addr, Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(addr, scratch);
++  as_subf(dest, scratch, dest);
++}
++
++void MacroAssembler::subDouble(FloatRegister src, FloatRegister dest) {
++  as_fsub(dest, dest, src);
++}
++
++void MacroAssembler::subFloat32(FloatRegister src, FloatRegister dest) {
++  as_fsubs(dest, dest, src);
++}
++
++void MacroAssembler::mul64(const Register64& rhs, const Register64& srcDest) {
++  as_mulld(srcDest.reg, srcDest.reg, rhs.reg);
++}
++
++void MacroAssembler::mul64(Imm64 imm, const Register64& dest) {
++  if (is_intN(int64_t(imm.value), 16)) {
++    as_mulli(dest.reg, dest.reg, int16_t(imm.value));
++  } else {
++    UseScratchRegisterScope temps(asMasm());
++    Register scratch = temps.Acquire();
++    MOZ_ASSERT(dest.reg != scratch);
++    movePtr(ImmWord(imm.value), scratch);
++    as_mulld(dest.reg, dest.reg, scratch);
++  }
++}
++
++void MacroAssembler::mul64(Imm64 imm, const Register64& dest,
++                           const Register temp) {
++  MOZ_ASSERT(temp == Register::Invalid());
++  mul64(imm, dest);
++}
++
++void MacroAssembler::mul64(const Register64& src, const Register64& dest,
++                           const Register temp) {
++  MOZ_ASSERT(temp == Register::Invalid());
++  as_mulld(dest.reg, dest.reg, src.reg);
++}
++
++void MacroAssembler::mulPtr(Register rhs, Register srcDest) {
++  as_mulld(srcDest, srcDest, rhs);
++}
++
++void MacroAssembler::mulPtr(ImmWord rhs, Register srcDest) {
++  if (is_intN(int64_t(rhs.value), 16)) {
++    as_mulli(srcDest, srcDest, int16_t(rhs.value));
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  MOZ_ASSERT(srcDest != scratch);
++  movePtr(rhs, scratch);
++  mulPtr(scratch, srcDest);
++}
++
++void MacroAssembler::mulBy3(Register src, Register dest) {
++  // mulli is the 16-bit-immediate form of mulld. 1 insn, no scratch,
++  // src==dest aliasing safe (RA read before RT write).
++  as_mulli(dest, src, 3);
++}
++
++void MacroAssembler::mul32(Register rhs, Register srcDest) {
++  as_mullw(srcDest, srcDest, rhs);
++  as_extsw(srcDest, srcDest);
++}
++
++void MacroAssembler::mul32(Imm32 imm, Register srcDest) {
++  if (is_intN(imm.value, 16)) {
++    as_mulli(srcDest, srcDest, imm.value);
++  } else {
++    UseScratchRegisterScope temps(asMasm());
++    Register scratch = temps.Acquire();
++    move32(imm, scratch);
++    as_mullw(srcDest, srcDest, scratch);
++  }
++  as_extsw(srcDest, srcDest);
++}
++
++void MacroAssembler::mulHighUnsigned32(Imm32 imm, Register src, Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  MOZ_ASSERT(src != scratch);
++  move32(imm, scratch);
++  as_mulhwu(dest, src, scratch);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::mulFloat32(FloatRegister src, FloatRegister dest) {
++  as_fmuls(dest, dest, src);
++}
++
++void MacroAssembler::mulDouble(FloatRegister src, FloatRegister dest) {
++  as_fmul(dest, dest, src);
++}
++
++void MacroAssembler::mulDoublePtr(ImmPtr imm, Register temp,
++                                  FloatRegister dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(imm, scratch);
++  as_lfd(ScratchDoubleReg, scratch, 0);
++  as_fmul(dest, dest, ScratchDoubleReg);
++}
++
++void MacroAssembler::inc64(AbsoluteAddress dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register addrReg = temps.Acquire();
++  movePtr(ImmWord(uintptr_t(dest.addr)), addrReg);
++  Register scratch = SecondScratchReg;
++  as_ld(scratch, addrReg, 0);
++  as_addi(scratch, scratch, 1);
++  as_std(scratch, addrReg, 0);
++}
++
++void MacroAssembler::divFloat32(FloatRegister src, FloatRegister dest) {
++  as_fdivs(dest, dest, src);
++}
++
++void MacroAssembler::divDouble(FloatRegister src, FloatRegister dest) {
++  as_fdiv(dest, dest, src);
++}
++
++void MacroAssembler::quotient32(Register lhs, Register rhs, Register dest,
++                                bool isUnsigned) {
++  if (isUnsigned) {
++    as_divwu(dest, lhs, rhs);
++  } else {
++    as_divw(dest, lhs, rhs);
++  }
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::quotient64(Register lhs, Register rhs, Register dest,
++                                bool isUnsigned) {
++  if (isUnsigned) {
++    as_divdu(dest, lhs, rhs);
++  } else {
++    as_divd(dest, lhs, rhs);
++  }
++}
++
++void MacroAssembler::remainder32(Register lhs, Register rhs, Register dest,
++                                 bool isUnsigned) {
++  if (HasPOWER9()) {
++    if (isUnsigned) {
++      as_moduw(dest, lhs, rhs);
++    } else {
++      as_modsw(dest, lhs, rhs);
++    }
++  } else {
++    UseScratchRegisterScope temps(asMasm());
++    Register scratch = temps.Acquire();
++    if (isUnsigned) {
++      as_divwu(scratch, lhs, rhs);
++      as_mullw(scratch, scratch, rhs);
++    } else {
++      as_divw(scratch, lhs, rhs);
++      as_mullw(scratch, scratch, rhs);
++    }
++    as_subf(dest, scratch, lhs);
++  }
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::remainder64(Register lhs, Register rhs, Register dest,
++                                 bool isUnsigned) {
++  if (HasPOWER9()) {
++    if (isUnsigned) {
++      as_modud(dest, lhs, rhs);
++    } else {
++      as_modsd(dest, lhs, rhs);
++    }
++  } else {
++    UseScratchRegisterScope temps(asMasm());
++    Register scratch = temps.Acquire();
++    if (isUnsigned) {
++      as_divdu(scratch, lhs, rhs);
++      as_mulld(scratch, scratch, rhs);
++    } else {
++      as_divd(scratch, lhs, rhs);
++      as_mulld(scratch, scratch, rhs);
++    }
++    as_subf(dest, scratch, lhs);
++  }
++}
++
++void MacroAssembler::neg64(Register64 reg) { as_neg(reg.reg, reg.reg); }
++
++void MacroAssembler::negPtr(Register reg) { as_neg(reg, reg); }
++
++void MacroAssembler::neg32(Register reg) {
++  as_neg(reg, reg);
++  as_extsw(reg, reg);
++}
++
++void MacroAssembler::negateDouble(FloatRegister reg) { as_fneg(reg, reg); }
++
++void MacroAssembler::negateFloat(FloatRegister reg) { as_fneg(reg, reg); }
++
++void MacroAssembler::abs32(Register src, Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  as_srawi(scratch, src, 31);
++  as_xor_(dest, src, scratch);
++  as_subf(dest, scratch, dest);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::absFloat32(FloatRegister src, FloatRegister dest) {
++  as_fabs(dest, src);
++}
++
++void MacroAssembler::absDouble(FloatRegister src, FloatRegister dest) {
++  as_fabs(dest, src);
++}
++
++void MacroAssembler::sqrtFloat32(FloatRegister src, FloatRegister dest) {
++  as_fsqrts(dest, src);
++}
++
++void MacroAssembler::sqrtDouble(FloatRegister src, FloatRegister dest) {
++  as_fsqrt(dest, src);
++}
++
++void MacroAssembler::min32(Register lhs, Register rhs, Register dest) {
++  as_cmpw(lhs, rhs);
++  // isel rt, ra, rb, cond: rt = (CR[cond] set) ? ra : rb
++  // LessThan set if lhs < rhs (signed), so pick lhs; else rhs = min.
++  as_isel(dest, lhs, rhs, LessThan, cr0);
++}
++
++void MacroAssembler::min32(Register lhs, Imm32 rhs, Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  move32(rhs, scratch);
++  min32(lhs, scratch, dest);
++}
++
++void MacroAssembler::max32(Register lhs, Register rhs, Register dest) {
++  as_cmpw(lhs, rhs);
++  // GT set if lhs > rhs (signed), so pick lhs; else rhs = max.
++  as_isel(dest, lhs, rhs, GreaterThan, cr0);
++}
++
++void MacroAssembler::max32(Register lhs, Imm32 rhs, Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  move32(rhs, scratch);
++  max32(lhs, scratch, dest);
++}
++
++void MacroAssembler::minPtr(Register lhs, Register rhs, Register dest) {
++  as_cmpd(lhs, rhs);
++  as_isel(dest, lhs, rhs, LessThan, cr0);
++}
++
++void MacroAssembler::minPtr(Register lhs, ImmWord rhs, Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(rhs, scratch);
++  minPtr(lhs, scratch, dest);
++}
++
++void MacroAssembler::maxPtr(Register lhs, Register rhs, Register dest) {
++  as_cmpd(lhs, rhs);
++  as_isel(dest, lhs, rhs, GreaterThan, cr0);
++}
++
++void MacroAssembler::maxPtr(Register lhs, ImmWord rhs, Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(rhs, scratch);
++  maxPtr(lhs, scratch, dest);
++}
++
++void MacroAssembler::minFloat32(FloatRegister other, FloatRegister srcDest,
++                                bool handleNaN) {
++  if (HasPOWER9()) {
++    // xsminjdp matches ECMA-262 Math.min semantics for ±0 and NaN.
++    // Float32 values are stored as doubles in PPC FPRs; the J-form
++    // result is bit-exact for values representable in float32 (which
++    // includes every NaN/±0/±Inf corner case JS observes). 1 insn.
++    as_xsminjdp(srcDest, srcDest, other);
++    return;
++  }
++  Label done, nan, equal;
++  as_fcmpu(srcDest, other);
++  if (handleNaN) {
++    ma_b(Assembler::DoubleUnordered, &nan);
++  }
++  // Handle +0 vs -0.
++  ma_b(Assembler::DoubleEqual, &equal);
++  ma_b(Assembler::DoubleLessThan, &done);
++  as_fmr(srcDest, other);
++  jump(&done);
++
++  bind(&equal);
++  // Both operands are equal. Check if they're zero.
++  loadConstantFloat32(0.0f, ScratchFloat32Reg);
++  as_fcmpu(srcDest, ScratchFloat32Reg);
++  // If not zero, they're identical; keep srcDest.
++  ma_b(Assembler::DoubleNotEqual, &done);
++  // Both are some combination of +0/-0. For min, result should be -0
++  // if either is -0: -((-srcDest) - other) gives -0 when either is -0.
++  as_fneg(ScratchFloat32Reg, srcDest);
++  as_fsubs(ScratchFloat32Reg, ScratchFloat32Reg, other);
++  as_fneg(srcDest, ScratchFloat32Reg);
++  jump(&done);
++
++  if (handleNaN) {
++    bind(&nan);
++    as_fadds(srcDest, srcDest, other);
++  }
++  bind(&done);
++}
++
++void MacroAssembler::minDouble(FloatRegister other, FloatRegister srcDest,
++                               bool handleNaN) {
++  if (HasPOWER9()) {
++    // xsminjdp matches ECMA-262 Math.min semantics exactly (covers
++    // 19 corner cases including ±0 and NaN). 1 insn vs ~12 for the
++    // fcmpu/branch fallback. POWER8 fallback follows.
++    as_xsminjdp(srcDest, srcDest, other);
++    return;
++  }
++  Label done, nan, equal;
++  as_fcmpu(srcDest, other);
++  if (handleNaN) {
++    ma_b(Assembler::DoubleUnordered, &nan);
++  }
++  // Handle +0 vs -0.
++  ma_b(Assembler::DoubleEqual, &equal);
++  ma_b(Assembler::DoubleLessThan, &done);
++  as_fmr(srcDest, other);
++  jump(&done);
++
++  bind(&equal);
++  loadConstantDouble(0.0, ScratchDoubleReg);
++  as_fcmpu(srcDest, ScratchDoubleReg);
++  ma_b(Assembler::DoubleNotEqual, &done);
++  // -((-srcDest) - other) gives -0 when either is -0.
++  as_fneg(ScratchDoubleReg, srcDest);
++  as_fsub(ScratchDoubleReg, ScratchDoubleReg, other);
++  as_fneg(srcDest, ScratchDoubleReg);
++  jump(&done);
++
++  if (handleNaN) {
++    bind(&nan);
++    as_fadd(srcDest, srcDest, other);
++  }
++  bind(&done);
++}
++
++void MacroAssembler::maxFloat32(FloatRegister other, FloatRegister srcDest,
++                                bool handleNaN) {
++  if (HasPOWER9()) {
++    // See minFloat32 above for the float32 ↔ J-form bit-exactness note.
++    as_xsmaxjdp(srcDest, srcDest, other);
++    return;
++  }
++  Label done, nan, equal;
++  as_fcmpu(srcDest, other);
++  if (handleNaN) {
++    ma_b(Assembler::DoubleUnordered, &nan);
++  }
++  // Handle +0 vs -0.
++  ma_b(Assembler::DoubleEqual, &equal);
++  ma_b(Assembler::DoubleGreaterThan, &done);
++  as_fmr(srcDest, other);
++  jump(&done);
++
++  bind(&equal);
++  loadConstantFloat32(0.0f, ScratchFloat32Reg);
++  as_fcmpu(srcDest, ScratchFloat32Reg);
++  ma_b(Assembler::DoubleNotEqual, &done);
++  // -0 + -0 = -0 and -0 + 0 = +0.
++  as_fadds(srcDest, srcDest, other);
++  jump(&done);
++
++  if (handleNaN) {
++    bind(&nan);
++    as_fadds(srcDest, srcDest, other);
++  }
++  bind(&done);
++}
++
++void MacroAssembler::maxDouble(FloatRegister other, FloatRegister srcDest,
++                               bool handleNaN) {
++  if (HasPOWER9()) {
++    // See minDouble above for the J-form semantics note.
++    as_xsmaxjdp(srcDest, srcDest, other);
++    return;
++  }
++  Label done, nan, equal;
++  as_fcmpu(srcDest, other);
++  if (handleNaN) {
++    ma_b(Assembler::DoubleUnordered, &nan);
++  }
++  // Handle +0 vs -0.
++  ma_b(Assembler::DoubleEqual, &equal);
++  ma_b(Assembler::DoubleGreaterThan, &done);
++  as_fmr(srcDest, other);
++  jump(&done);
++
++  bind(&equal);
++  loadConstantDouble(0.0, ScratchDoubleReg);
++  as_fcmpu(srcDest, ScratchDoubleReg);
++  ma_b(Assembler::DoubleNotEqual, &done);
++  // -0 + -0 = -0 and -0 + 0 = +0.
++  as_fadd(srcDest, srcDest, other);
++  jump(&done);
++
++  if (handleNaN) {
++    bind(&nan);
++    as_fadd(srcDest, srcDest, other);
++  }
++  bind(&done);
++}
++
++// ===============================================================
++// Shift functions
++
++void MacroAssembler::lshift32(Register src, Register dest) {
++  UseScratchRegisterScope temps(*this);
++  Register masked = temps.Acquire();
++  as_rlwinm(masked, src, 0, 27, 31);
++  as_slw(dest, dest, masked);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::lshift32(Imm32 imm, Register dest) {
++  lshift32(imm, dest, dest);
++}
++
++void MacroAssembler::lshift32(Imm32 imm, Register src, Register dest) {
++  x_slwi(dest, src, imm.value & 0x1f);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::flexibleLshift32(Register src, Register dest) {
++  lshift32(src, dest);
++}
++
++void MacroAssembler::lshift64(Register shift, Register64 dest) {
++  // PPC64 sld uses a 7-bit shift field; shifts >= 64 produce 0.
++  // Wasm i64.shl requires shift count modulo 64, so mask to 6 bits.
++  UseScratchRegisterScope temps(asMasm());
++  Register masked = temps.Acquire();
++  as_rldicl(masked, shift, 0, 58);  // clrldi: keep low 6 bits
++  as_sld(dest.reg, dest.reg, masked);
++}
++
++void MacroAssembler::lshift64(Imm32 imm, Register64 dest) {
++  MOZ_ASSERT(0 <= imm.value && imm.value < 64);
++  x_sldi(dest.reg, dest.reg, imm.value);
++}
++
++void MacroAssembler::lshiftPtr(Register shift, Register dest) {
++  as_sld(dest, dest, shift);
++}
++
++void MacroAssembler::lshiftPtr(Imm32 imm, Register dest) {
++  lshiftPtr(imm, dest, dest);
++}
++
++void MacroAssembler::lshiftPtr(Imm32 imm, Register src, Register dest) {
++  MOZ_ASSERT(0 <= imm.value && imm.value < 64);
++  x_sldi(dest, src, imm.value);
++}
++
++void MacroAssembler::flexibleLshiftPtr(Register shift, Register srcDest) {
++  lshiftPtr(shift, srcDest);
++}
++
++void MacroAssembler::rshift32(Register src, Register dest) {
++  UseScratchRegisterScope temps(*this);
++  Register masked = temps.Acquire();
++  as_rlwinm(masked, src, 0, 27, 31);
++  as_srw(dest, dest, masked);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::rshift32(Imm32 imm, Register dest) {
++  rshift32(imm, dest, dest);
++}
++
++void MacroAssembler::rshift32(Imm32 imm, Register src, Register dest) {
++  x_srwi(dest, src, imm.value & 0x1f);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::flexibleRshift32(Register src, Register dest) {
++  rshift32(src, dest);
++}
++
++void MacroAssembler::rshift32Arithmetic(Register src, Register dest) {
++  UseScratchRegisterScope temps(*this);
++  Register masked = temps.Acquire();
++  as_rlwinm(masked, src, 0, 27, 31);
++  as_sraw(dest, dest, masked);
++}
++
++void MacroAssembler::rshift32Arithmetic(Imm32 imm, Register dest) {
++  rshift32Arithmetic(imm, dest, dest);
++}
++
++void MacroAssembler::rshift32Arithmetic(Imm32 imm, Register src,
++                                        Register dest) {
++  as_srawi(dest, src, imm.value & 0x1f);
++}
++
++void MacroAssembler::flexibleRshift32Arithmetic(Register src, Register dest) {
++  rshift32Arithmetic(src, dest);
++}
++
++void MacroAssembler::rshift64(Register shift, Register64 dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register masked = temps.Acquire();
++  as_rldicl(masked, shift, 0, 58);
++  as_srd(dest.reg, dest.reg, masked);
++}
++
++void MacroAssembler::rshift64(Imm32 imm, Register64 dest) {
++  MOZ_ASSERT(0 <= imm.value && imm.value < 64);
++  x_srdi(dest.reg, dest.reg, imm.value);
++}
++
++void MacroAssembler::rshift64Arithmetic(Imm32 imm, Register64 dest) {
++  MOZ_ASSERT(0 <= imm.value && imm.value < 64);
++  as_sradi(dest.reg, dest.reg, imm.value);
++}
++
++void MacroAssembler::rshift64Arithmetic(Register shift, Register64 dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register masked = temps.Acquire();
++  as_rldicl(masked, shift, 0, 58);
++  as_srad(dest.reg, dest.reg, masked);
++}
++
++void MacroAssembler::rshiftPtr(Register shift, Register dest) {
++  as_srd(dest, dest, shift);
++}
++
++void MacroAssembler::rshiftPtr(Imm32 imm, Register dest) {
++  rshiftPtr(imm, dest, dest);
++}
++
++void MacroAssembler::rshiftPtr(Imm32 imm, Register src, Register dest) {
++  MOZ_ASSERT(0 <= imm.value && imm.value < 64);
++  x_srdi(dest, src, imm.value);
++}
++
++void MacroAssembler::flexibleRshiftPtr(Register shift, Register srcDest) {
++  rshiftPtr(shift, srcDest);
++}
++
++void MacroAssembler::rshiftPtrArithmetic(Imm32 imm, Register dest) {
++  rshiftPtrArithmetic(imm, dest, dest);
++}
++
++void MacroAssembler::rshiftPtrArithmetic(Imm32 imm, Register src,
++                                         Register dest) {
++  MOZ_ASSERT(0 <= imm.value && imm.value < 64);
++  as_sradi(dest, src, imm.value);
++}
++
++void MacroAssembler::rshiftPtrArithmetic(Register shift, Register dest) {
++  as_srad(dest, dest, shift);
++}
++
++void MacroAssembler::flexibleRshiftPtrArithmetic(Register shift,
++                                                 Register srcDest) {
++  rshiftPtrArithmetic(shift, srcDest);
++}
++
++// ===============================================================
++// Rotation functions
++
++void MacroAssembler::rotateLeft(Register count, Register input, Register dest) {
++  // PPC rotlw is rlwnm with full mask: rlwnm dest, input, count, 0, 31
++  as_rlwnm(dest, input, count, 0, 31);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::rotateLeft(Imm32 count, Register input, Register dest) {
++  as_rlwinm(dest, input, count.value & 31, 0, 31);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::rotateLeft64(Register count, Register64 src,
++                                  Register64 dest, Register temp) {
++  MOZ_ASSERT(temp == Register::Invalid());
++  // rldcl dest, src, count, 0 — rotate left doubleword then clear left 0 bits.
++  as_rldcl(dest.reg, src.reg, count, 0);
++}
++
++void MacroAssembler::rotateLeft64(Imm32 count, Register64 src, Register64 dest,
++                                  Register temp) {
++  MOZ_ASSERT(temp == Register::Invalid());
++  // rldicl dest, src, count, 0 — rotate left doubleword immediate then clear.
++  as_rldicl(dest.reg, src.reg, count.value & 63, 0);
++}
++
++void MacroAssembler::rotateRight(Register count, Register input,
++                                 Register dest) {
++  // rotateRight(n) = rotateLeft(32-n). When dest != input, the negated
++  // count can land directly in dest, dropping the GPR scratch. dest may
++  // alias count harmlessly (subfic reads count, then writes dest, then
++  // rlwnm consumes the new dest as its rotate-count).
++  if (dest != input) {
++    as_subfic(dest, count, 32);
++    as_rlwnm(dest, input, dest, 0, 31);
++    as_extsw(dest, dest);
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  as_subfic(scratch, count, 32);
++  as_rlwnm(dest, input, scratch, 0, 31);
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::rotateRight(Imm32 count, Register input, Register dest) {
++  rotateLeft(Imm32((32 - count.value) & 31), input, dest);
++}
++
++void MacroAssembler::rotateRight64(Register count, Register64 src,
++                                   Register64 dest, Register temp) {
++  MOZ_ASSERT(temp == Register::Invalid());
++  // Same shape as rotateRight32: when dest != src, the negated count
++  // can land directly in dest, dropping the GPR scratch.
++  if (dest.reg != src.reg) {
++    as_subfic(dest.reg, count, 64);
++    as_rldcl(dest.reg, src.reg, dest.reg, 0);
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  as_subfic(scratch, count, 64);
++  as_rldcl(dest.reg, src.reg, scratch, 0);
++}
++
++void MacroAssembler::rotateRight64(Imm32 count, Register64 src, Register64 dest,
++                                   Register temp) {
++  MOZ_ASSERT(temp == Register::Invalid());
++  rotateLeft64(Imm32((64 - count.value) & 63), src, dest, temp);
++}
++
++// ===============================================================
++// Bit counting functions
++
++void MacroAssembler::clz64(Register64 src, Register64 dest) {
++  as_cntlzd(dest.reg, src.reg);
++}
++
++void MacroAssembler::ctz64(Register64 src, Register64 dest) {
++  if (HasPOWER9()) {
++    as_cnttzd(dest.reg, src.reg);
++  } else {
++    UseScratchRegisterScope temps(*this);
++    Register tmp = temps.Acquire();
++    as_neg(tmp, src.reg);
++    // and. (record form) sets CR0[eq] based on result; result is 0 iff src==0,
++    // so this folds the explicit zero-check that would otherwise need cmpdi.
++    as_and__rc(tmp, src.reg, tmp);  // tmp = x & -x; CR0[eq] = (src == 0)
++    as_cntlzd(tmp, tmp);            // tmp = clz(isolated bit)
++    as_subfic(dest.reg, tmp, 63);   // dest = 63 - clz = ctz (for nonzero)
++    xs_li(tmp, 64);
++    as_isel(dest.reg, tmp, dest.reg, Equal);  // CR0[eq] → 64 if src==0
++  }
++}
++
++void MacroAssembler::popcnt64(Register64 input, Register64 output,
++                              Register tmp) {
++  as_popcntd(output.reg, input.reg);
++}
++
++void MacroAssembler::clz32(Register src, Register dest, bool knownNotZero) {
++  as_cntlzw(dest, src);
++}
++
++void MacroAssembler::ctz32(Register src, Register dest, bool knownNotZero) {
++  if (HasPOWER9()) {
++    as_cnttzw(dest, src);
++  } else {
++    UseScratchRegisterScope temps(*this);
++    Register tmp = temps.Acquire();
++    as_neg(tmp, src);
++    // and. record form folds the cmpwi src,0 that would otherwise be needed
++    // to drive the isel below: tmp == 0 iff src == 0.
++    if (knownNotZero) {
++      as_and_(tmp, src, tmp);
++    } else {
++      as_and__rc(tmp, src, tmp);  // CR0[eq] = (src == 0)
++    }
++    as_cntlzw(tmp, tmp);
++    as_subfic(dest, tmp, 31);
++    if (!knownNotZero) {
++      xs_li(tmp, 32);
++      as_isel(dest, tmp, dest, Equal);  // CR0[eq] → 32 if src==0
++    }
++  }
++}
++
++void MacroAssembler::popcnt32(Register input, Register output, Register tmp) {
++  as_popcntw(output, input);
++  // popcntw gives per-word results; on 64-bit the low word count is in
++  // bits 32:63, so just mask to 32 bits.
++  as_rlwinm(output, output, 0, 0, 31);
++}
++
++// ===============================================================
++// Condition functions
++
++void MacroAssembler::cmp8Set(Condition cond, Address lhs, Imm32 rhs,
++                             Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  MOZ_ASSERT(scratch != lhs.base);
++  bool isUnsigned = (cond & Assembler::ConditionUnsigned) != 0;
++  if (isUnsigned) {
++    load8ZeroExtend(lhs, scratch);
++    Condition c = ma_cmp(scratch, Imm32(uint8_t(rhs.value)), cond, true);
++    ma_cmp_set(dest, c);
++  } else {
++    load8SignExtend(lhs, scratch);
++    Condition c = ma_cmp(scratch, Imm32(int8_t(rhs.value)), cond, true);
++    ma_cmp_set(dest, c);
++  }
++}
++
++void MacroAssembler::cmp16Set(Condition cond, Address lhs, Imm32 rhs,
++                              Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  MOZ_ASSERT(scratch != lhs.base);
++  bool isUnsigned = (cond & Assembler::ConditionUnsigned) != 0;
++  if (isUnsigned) {
++    load16ZeroExtend(lhs, scratch);
++    Condition c = ma_cmp(scratch, Imm32(uint16_t(rhs.value)), cond, true);
++    ma_cmp_set(dest, c);
++  } else {
++    load16SignExtend(lhs, scratch);
++    Condition c = ma_cmp(scratch, Imm32(int16_t(rhs.value)), cond, true);
++    ma_cmp_set(dest, c);
++  }
++}
++
++template <typename T1, typename T2>
++void MacroAssembler::cmp32Set(Condition cond, T1 lhs, T2 rhs, Register dest) {
++  Condition c = ma_cmp(lhs, rhs, cond, true);
++  ma_cmp_set(dest, c);
++}
++
++void MacroAssembler::cmp64Set(Condition cond, Register64 lhs, Register64 rhs,
++                              Register dest) {
++  Condition c = ma_cmp(lhs.reg, rhs.reg, cond);
++  ma_cmp_set(dest, c);
++}
++
++void MacroAssembler::cmp64Set(Condition cond, Register64 lhs, Imm64 rhs,
++                              Register dest) {
++  Condition c = ma_cmp(lhs.reg, ImmWord(uint64_t(rhs.value)), cond);
++  ma_cmp_set(dest, c);
++}
++
++void MacroAssembler::cmp64Set(Condition cond, Address lhs, Register64 rhs,
++                              Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(lhs, scratch);
++  Condition c = ma_cmp(scratch, rhs.reg, cond);
++  ma_cmp_set(dest, c);
++}
++
++void MacroAssembler::cmp64Set(Condition cond, Address lhs, Imm64 rhs,
++                              Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(lhs, scratch);
++  Condition c = ma_cmp(scratch, ImmWord(uint64_t(rhs.value)), cond);
++  ma_cmp_set(dest, c);
++}
++
++template <typename T1, typename T2>
++void MacroAssembler::cmpPtrSet(Condition cond, T1 lhs, T2 rhs, Register dest) {
++  Condition c = ma_cmp(lhs, rhs, cond);
++  ma_cmp_set(dest, c);
++}
++
++// ===============================================================
++// Branch functions
++
++void MacroAssembler::branch8(Condition cond, const Address& lhs, Imm32 rhs,
++                             Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  // Mirror ARM64/LoongArch64/RISC-V: narrow the immediate to the 8-bit
++  // memory operand width so both sides of the compare have matching bit
++  // patterns regardless of how move32(Imm32) materializes the imm. Use
++  // uint8 cast for equality / unsigned, int8 cast for signed relational.
++  bool isEqOrNe = (cond == Assembler::Equal) || (cond == Assembler::NotEqual);
++  bool isUnsigned = (cond & Assembler::ConditionUnsigned) != 0;
++  Imm32 narrowed(0);
++  if (isEqOrNe || isUnsigned) {
++    load8ZeroExtend(lhs, scratch);
++    narrowed = Imm32(uint8_t(rhs.value));
++  } else {
++    load8SignExtend(lhs, scratch);
++    narrowed = Imm32(int8_t(rhs.value));
++  }
++  Condition c = ma_cmp(scratch, narrowed, cond, true);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branch8(Condition cond, const BaseIndex& lhs, Register rhs,
++                             Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load8ZeroExtend(lhs, scratch);
++  Condition c = ma_cmp(scratch, rhs, cond, true);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branch16(Condition cond, const Address& lhs, Imm32 rhs,
++                              Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  // See branch8: narrow the immediate to 16 bits so both sides have matching
++  // bit patterns. uint16 for equality / unsigned, int16 for signed relational.
++  bool isEqOrNe = (cond == Assembler::Equal) || (cond == Assembler::NotEqual);
++  bool isUnsigned = (cond & Assembler::ConditionUnsigned) != 0;
++  Imm32 narrowed(0);
++  if (isEqOrNe || isUnsigned) {
++    load16ZeroExtend(lhs, scratch);
++    narrowed = Imm32(uint16_t(rhs.value));
++  } else {
++    load16SignExtend(lhs, scratch);
++    narrowed = Imm32(int16_t(rhs.value));
++  }
++  Condition c = ma_cmp(scratch, narrowed, cond, true);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, Register lhs, Register rhs,
++                              Label* label) {
++  Condition c = ma_cmp(lhs, rhs, cond, true);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, Register lhs, Imm32 imm,
++                              Label* label) {
++  Condition c = ma_cmp(lhs, imm, cond, true);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, const Address& lhs, Register rhs,
++                              Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(lhs, scratch);
++  Condition c = ma_cmp(scratch, rhs, cond, true);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, const Address& lhs, Imm32 rhs,
++                              Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(lhs, scratch);
++  Condition c = ma_cmp(scratch, rhs, cond, true);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, const AbsoluteAddress& lhs,
++                              Register rhs, Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord((uintptr_t)lhs.addr), scratch);
++  load32(Address(scratch, 0), scratch);
++  Condition c = ma_cmp(scratch, rhs, cond, true);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, const AbsoluteAddress& lhs,
++                              Imm32 rhs, Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord((uintptr_t)lhs.addr), scratch);
++  load32(Address(scratch, 0), scratch);
++  Condition c = ma_cmp(scratch, rhs, cond, true);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, const BaseIndex& lhs, Imm32 rhs,
++                              Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(lhs, scratch);
++  Condition c = ma_cmp(scratch, rhs, cond, true);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, wasm::SymbolicAddress addr,
++                              Imm32 imm, Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(addr, scratch);
++  load32(Address(scratch, 0), scratch);
++  Condition c = ma_cmp(scratch, imm, cond, true);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branch64(Condition cond, Register64 lhs, Imm64 val,
++                              Label* success, Label* fail) {
++  Condition c = ma_cmp(lhs.reg, ImmWord(uint64_t(val.value)), cond);
++  if (fail) {
++    ma_b(c, success);
++    jump(fail);
++  } else {
++    ma_b(c, success);
++  }
++}
++
++void MacroAssembler::branch64(Condition cond, Register64 lhs, Register64 rhs,
++                              Label* success, Label* fail) {
++  Condition c = ma_cmp(lhs.reg, rhs.reg, cond);
++  if (fail) {
++    ma_b(c, success);
++    jump(fail);
++  } else {
++    ma_b(c, success);
++  }
++}
++
++void MacroAssembler::branch64(Condition cond, const Address& lhs, Imm64 val,
++                              Label* success, Label* fail) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(lhs, scratch);
++  Condition c = ma_cmp(scratch, ImmWord(uint64_t(val.value)), cond);
++  if (fail) {
++    ma_b(c, success);
++    jump(fail);
++  } else {
++    ma_b(c, success);
++  }
++}
++
++void MacroAssembler::branch64(Condition cond, const Address& lhs,
++                              Register64 rhs, Label* success, Label* fail) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(lhs, scratch);
++  Condition c = ma_cmp(scratch, rhs.reg, cond);
++  if (fail) {
++    ma_b(c, success);
++    jump(fail);
++  } else {
++    ma_b(c, success);
++  }
++}
++
++void MacroAssembler::branch64(Condition cond, const Address& lhs,
++                              const Address& rhs, Register scratch,
++                              Label* label) {
++  loadPtr(rhs, scratch);
++  branch64(cond, lhs, Register64(scratch), label, nullptr);
++}
++
++void MacroAssembler::branchPtr(Condition cond, Register lhs, Register rhs,
++                               Label* label) {
++  Condition c = ma_cmp(lhs, rhs, cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, Register lhs, Imm32 rhs,
++                               Label* label) {
++  Condition c = ma_cmp(lhs, ImmWord(int64_t(rhs.value)), cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, Register lhs, ImmPtr rhs,
++                               Label* label) {
++  Condition c = ma_cmp(lhs, rhs, cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, Register lhs, ImmGCPtr rhs,
++                               Label* label) {
++  Condition c = ma_cmp(lhs, rhs, cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, Register lhs, ImmWord rhs,
++                               Label* label) {
++  Condition c = ma_cmp(lhs, rhs, cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const Address& lhs, Register rhs,
++                               Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(lhs, scratch);
++  Condition c = ma_cmp(scratch, rhs, cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const Address& lhs, ImmPtr rhs,
++                               Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(lhs, scratch);
++  Condition c = ma_cmp(scratch, rhs, cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const Address& lhs, ImmGCPtr rhs,
++                               Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(lhs, scratch);
++  Condition c = ma_cmp(scratch, rhs, cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const Address& lhs, ImmWord rhs,
++                               Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(lhs, scratch);
++  Condition c = ma_cmp(scratch, rhs, cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const AbsoluteAddress& lhs,
++                               Register rhs, Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord((uintptr_t)lhs.addr), scratch);
++  loadPtr(Address(scratch, 0), scratch);
++  Condition c = ma_cmp(scratch, rhs, cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const AbsoluteAddress& lhs,
++                               ImmWord rhs, Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord((uintptr_t)lhs.addr), scratch);
++  loadPtr(Address(scratch, 0), scratch);
++  Condition c = ma_cmp(scratch, rhs, cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, wasm::SymbolicAddress lhs,
++                               Register rhs, Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(lhs, scratch);
++  loadPtr(Address(scratch, 0), scratch);
++  Condition c = ma_cmp(scratch, rhs, cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const BaseIndex& lhs,
++                               Register rhs, Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(lhs, scratch);
++  Condition c = ma_cmp(scratch, rhs, cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const BaseIndex& lhs,
++                               ImmWord rhs, Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(lhs, scratch);
++  Condition c = ma_cmp(scratch, rhs, cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchPrivatePtr(Condition cond, const Address& lhs,
++                                      Register rhs, Label* label) {
++  branchPtr(cond, lhs, rhs, label);
++}
++
++void MacroAssembler::branchFloat(DoubleCondition cond, FloatRegister lhs,
++                                 FloatRegister rhs, Label* label) {
++  as_fcmpu(lhs, rhs);
++  ma_b(cond, label);
++}
++
++void MacroAssembler::branchTruncateFloat32MaybeModUint32(FloatRegister src,
++                                                         Register dest,
++                                                         Label* fail) {
++  // Convert float32 to int64 (truncating toward zero), fail on NaN/overflow.
++  as_fctidz(ScratchDoubleReg, src);
++  as_mfvsrd(dest, ScratchDoubleReg);
++  // PPC64 fctidz saturates to INT64_MIN on negative overflow/NaN,
++  // and to INT64_MAX on positive overflow. Check both.
++  asMasm().branchPtr(Assembler::Equal, dest, ImmWord(int64_t(INT64_MIN)), fail);
++  asMasm().branchPtr(Assembler::Equal, dest, ImmWord(int64_t(INT64_MAX)), fail);
++  // Truncate to uint32 (keep low 32 bits).
++  as_rldicl(dest, dest, 0, 32);
++}
++
++void MacroAssembler::branchTruncateFloat32ToInt32(FloatRegister src,
++                                                  Register dest, Label* fail) {
++  convertFloat32ToInt32(src, dest, fail, false);
++}
++
++void MacroAssembler::branchDouble(DoubleCondition cond, FloatRegister lhs,
++                                  FloatRegister rhs, Label* label) {
++  as_fcmpu(lhs, rhs);
++  ma_b(cond, label);
++}
++
++void MacroAssembler::branchTruncateDoubleMaybeModUint32(FloatRegister src,
++                                                        Register dest,
++                                                        Label* fail) {
++  // Convert double to int64 (truncating toward zero), fail on NaN/overflow.
++  as_fctidz(ScratchDoubleReg, src);
++  as_mfvsrd(dest, ScratchDoubleReg);
++  // PPC64 fctidz saturates to INT64_MIN on negative overflow/NaN,
++  // and to INT64_MAX on positive overflow. Check both.
++  asMasm().branchPtr(Assembler::Equal, dest, ImmWord(int64_t(INT64_MIN)), fail);
++  asMasm().branchPtr(Assembler::Equal, dest, ImmWord(int64_t(INT64_MAX)), fail);
++  // Truncate to uint32 (keep low 32 bits).
++  as_rldicl(dest, dest, 0, 32);
++}
++
++void MacroAssembler::branchTruncateDoubleToInt32(FloatRegister src,
++                                                 Register dest, Label* fail) {
++  convertDoubleToInt32(src, dest, fail, false);
++}
++
++void MacroAssembler::branchInt64NotInPtrRange(Register64 src, Label* label) {
++  // No-op on 64-bit.
++}
++
++void MacroAssembler::branchUInt64NotInPtrRange(Register64 src, Label* label) {
++  // Branch if src >= 2^63 (sign bit set = out of signed ptr range).
++  as_cmpdi(src.reg, 0);
++  ma_b(Assembler::LessThan, label);
++}
++
++template <typename T>
++void MacroAssembler::branchAdd32(Condition cond, T src, Register dest,
++                                 Label* overflow) {
++  switch (cond) {
++    case Overflow: {
++      // Do raw 64-bit add (no sign extension) so we can detect 32-bit overflow.
++      // Both inputs should already be sign-extended 32-bit values, so the
++      // 64-bit result is mathematically correct. If extsw(result) != result,
++      // the 32-bit add overflowed.
++      UseScratchRegisterScope temps(asMasm());
++      Register scratch = temps.Acquire();
++      addPtr(src, dest);
++      as_extsw(scratch, dest);
++      as_cmpd(dest, scratch);
++      as_extsw(dest, dest);
++      ma_b(NotEqual, overflow);
++      break;
++    }
++    case NonZero:
++    case Zero:
++      add32(src, dest);
++      as_cmpdi(dest, 0);
++      ma_b(cond == NonZero ? NotEqual : Equal, overflow);
++      break;
++    case Signed:
++    case NotSigned:
++      add32(src, dest);
++      as_cmpdi(dest, 0);
++      ma_b(cond == Signed ? LessThan : GreaterThanOrEqual, overflow);
++      break;
++    case CarryClear:
++    case CarrySet: {
++      // Unsigned 32-bit carry detection: save dest, do 32-bit add,
++      // then unsigned-compare result with original. If result < original
++      // (unsigned), a carry occurred.
++      UseScratchRegisterScope temps(asMasm());
++      Register scratch = temps.Acquire();
++      move32(dest, scratch);
++      add32(src, dest);
++      as_cmplw(dest, scratch);
++      ma_b(cond == CarrySet ? LessThan : GreaterThanOrEqual, overflow);
++      break;
++    }
++    default:
++      MOZ_CRASH("NYI");
++  }
++}
++
++template <typename T>
++void MacroAssembler::branchSub32(Condition cond, T src, Register dest,
++                                 Label* overflow) {
++  switch (cond) {
++    case Overflow: {
++      // Do raw 64-bit sub (no sign extension) so we can detect 32-bit overflow.
++      UseScratchRegisterScope temps(asMasm());
++      Register scratch = temps.Acquire();
++      subPtr(src, dest);
++      as_extsw(scratch, dest);
++      as_cmpd(dest, scratch);
++      as_extsw(dest, dest);
++      ma_b(NotEqual, overflow);
++      break;
++    }
++    case NonZero:
++    case Zero:
++      sub32(src, dest);
++      as_cmpdi(dest, 0);
++      ma_b(cond == NonZero ? NotEqual : Equal, overflow);
++      break;
++    case Signed:
++    case NotSigned:
++      sub32(src, dest);
++      as_cmpdi(dest, 0);
++      ma_b(cond == Signed ? LessThan : GreaterThanOrEqual, overflow);
++      break;
++    default:
++      MOZ_CRASH("NYI");
++  }
++}
++
++template <typename T>
++void MacroAssembler::branchMul32(Condition cond, T src, Register dest,
++                                 Label* overflow) {
++  MOZ_ASSERT(cond == Overflow);
++  // Do raw 64-bit multiply (no sign extension) so we can detect 32-bit
++  // overflow. as_mulld gives full 64-bit low result; if extsw(result) !=
++  // result, overflow. scratch is dead after the mulld (consumed as RB),
++  // so the sign-extension round-trip reuses it instead of acquiring a
++  // second scratch.
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  move32(src, scratch);
++  as_mulld(dest, dest, scratch);
++  as_extsw(scratch, dest);
++  as_cmpd(dest, scratch);
++  as_extsw(dest, dest);
++  ma_b(NotEqual, overflow);
++}
++
++template <typename T>
++void MacroAssembler::branchRshift32(Condition cond, T src, Register dest,
++                                    Label* label) {
++  MOZ_ASSERT(cond == Zero || cond == NonZero);
++  rshift32(src, dest);
++  branch32(cond == Zero ? Equal : NotEqual, dest, Imm32(0), label);
++}
++
++void MacroAssembler::branchNeg32(Condition cond, Register reg, Label* label) {
++  MOZ_ASSERT(cond == Overflow);
++  neg32(reg);
++  branch32(Equal, reg, Imm32(INT32_MIN), label);
++}
++
++template <typename T>
++void MacroAssembler::branchAddPtr(Condition cond, T src, Register dest,
++                                  Label* label) {
++  switch (cond) {
++    case Overflow: {
++      UseScratchRegisterScope temps(asMasm());
++      Register scratch = temps.Acquire();
++      movePtr(dest, scratch);  // scratch = old_dest
++      addPtr(src, dest);       // dest = result = old_dest + src
++      as_xor_(SecondScratchReg, dest,
++              scratch);  // SecondScratch = result ^ old_dest
++      as_subf(scratch, scratch,
++              dest);  // scratch = result - old_dest = src_value
++      as_xor_(scratch, scratch, dest);  // scratch = src_value ^ result
++      // (old_dest ^ result) & (src_value ^ result): bit 63 set iff overflow.
++      // and. record form sets CR0[lt]=(bit 63 set), folding the cmpdi.
++      as_and__rc(scratch, scratch, SecondScratchReg);
++      ma_b(LessThan, label);
++      break;
++    }
++    case NonZero:
++    case Zero:
++      addPtr(src, dest);
++      as_cmpdi(dest, 0);
++      ma_b(cond == NonZero ? NotEqual : Equal, label);
++      break;
++    case Signed:
++    case NotSigned:
++      addPtr(src, dest);
++      as_cmpdi(dest, 0);
++      ma_b(cond == Signed ? LessThan : GreaterThanOrEqual, label);
++      break;
++    case CarryClear:
++    case CarrySet: {
++      // Unsigned 64-bit carry detection: save dest, do 64-bit add,
++      // then unsigned-compare result with original. If result < original
++      // (unsigned), a carry occurred.
++      UseScratchRegisterScope temps(asMasm());
++      Register scratch = temps.Acquire();
++      movePtr(dest, scratch);
++      addPtr(src, dest);
++      as_cmpld(dest, scratch);
++      ma_b(cond == CarrySet ? LessThan : GreaterThanOrEqual, label);
++      break;
++    }
++    default:
++      MOZ_CRASH("NYI");
++  }
++}
++
++template <typename T>
++void MacroAssembler::branchSubPtr(Condition cond, T src, Register dest,
++                                  Label* label) {
++  switch (cond) {
++    case Overflow: {
++      UseScratchRegisterScope temps(asMasm());
++      Register scratch = temps.Acquire();
++      movePtr(dest, scratch);  // scratch = old_dest
++      subPtr(src, dest);       // dest = result = old_dest - src
++      // Overflow if (old_dest ^ src_value) & (old_dest ^ result) has bit 63
++      // set.
++      as_subf(SecondScratchReg, dest,
++              scratch);  // SecondScratch = old_dest - result = src_value
++      as_xor_(SecondScratchReg, scratch,
++              SecondScratchReg);        // old_dest ^ src_value
++      as_xor_(scratch, scratch, dest);  // old_dest ^ result
++      // Record-form AND sets CR0 to the signed compare of the result vs 0,
++      // so a separate cmpdi is unnecessary; LessThan reads CR0.LT.
++      as_and__rc(scratch, scratch, SecondScratchReg);
++      ma_b(LessThan, label);
++      break;
++    }
++    case NonZero:
++    case Zero:
++      subPtr(src, dest);
++      as_cmpdi(dest, 0);
++      ma_b(cond == NonZero ? NotEqual : Equal, label);
++      break;
++    case Signed:
++    case NotSigned:
++      subPtr(src, dest);
++      as_cmpdi(dest, 0);
++      ma_b(cond == Signed ? LessThan : GreaterThanOrEqual, label);
++      break;
++    default:
++      MOZ_CRASH("NYI");
++  }
++}
++
++void MacroAssembler::branchMulPtr(Condition cond, Register src, Register dest,
++                                  Label* label) {
++  MOZ_ASSERT(cond == Assembler::Overflow);
++  as_mulldo(dest, dest, src);
++  ma_b(Overflow, label);
++}
++
++void MacroAssembler::branchNegPtr(Condition cond, Register reg, Label* label) {
++  MOZ_ASSERT(cond == Overflow);
++  negPtr(reg);
++  branchPtr(Assembler::Equal, reg, ImmWord(intptr_t(INTPTR_MIN)), label);
++}
++
++void MacroAssembler::decBranchPtr(Condition cond, Register lhs, Imm32 rhs,
++                                  Label* label) {
++  subPtr(rhs, lhs);
++  branchPtr(cond, lhs, Imm32(0), label);
++}
++
++void MacroAssembler::branchTest32(Condition cond, Register lhs, Register rhs,
++                                  Label* label) {
++  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++             cond == NotSigned);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  if (lhs != rhs) {
++    as_and_(scratch, lhs, rhs);
++    as_extsw_rc(scratch, scratch);  // CR0 set on sign-extended i32; folds cmpdi
++  } else {
++    as_extsw_rc(scratch, lhs);
++  }
++  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++  ma_b(base, label);
++}
++
++void MacroAssembler::branchTest32(Condition cond, Register lhs, Imm32 rhs,
++                                  Label* label) {
++  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++             cond == NotSigned);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  if (is_uintN(rhs.value, 16)) {
++    as_andi_rc(scratch, lhs, rhs.value);
++    // andi_rc sets CR0 on the masked value, but only the low 16 bits matter
++    // since rhs is a 16-bit unsigned mask — sign of the i32 result is always
++    // 0, so CR0[lt] is always 0. For Signed/NotSigned conditions the answer
++    // is fixed; for Zero/NonZero CR0[eq] is correct.
++  } else {
++    move32(rhs, scratch);
++    as_and_(scratch, lhs, scratch);
++    as_extsw_rc(scratch, scratch);  // CR0 set on sign-extended i32; folds cmpdi
++  }
++  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++  ma_b(base, label);
++}
++
++void MacroAssembler::branchTest32(Condition cond, const Address& lhs, Imm32 rhs,
++                                  Label* label) {
++  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++             cond == NotSigned);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(lhs, scratch);
++  // and32 picks up the rlwinm contig-mask fast path for non-16-bit-fit
++  // immediates that are a contiguous run of 1-bits (common: tag masks,
++  // header bit-fields). It also emits the trailing extsw.
++  and32(rhs, scratch);
++  as_cmpdi(scratch, 0);
++  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++  ma_b(base, label);
++}
++
++void MacroAssembler::branchTest32(Condition cond, const AbsoluteAddress& lhs,
++                                  Imm32 rhs, Label* label) {
++  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++             cond == NotSigned);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord((uintptr_t)lhs.addr), scratch);
++  load32(Address(scratch, 0), scratch);
++  and32(rhs, scratch);
++  as_cmpdi(scratch, 0);
++  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++  ma_b(base, label);
++}
++
++void MacroAssembler::branchTestPtr(Condition cond, Register lhs, Register rhs,
++                                   Label* label) {
++  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++             cond == NotSigned);
++  if (lhs == rhs) {
++    as_cmpdi(lhs, 0);
++  } else {
++    UseScratchRegisterScope temps(asMasm());
++    Register scratch = temps.Acquire();
++    // Record-form AND sets CR0; no follow-up cmpdi needed.
++    as_and__rc(scratch, lhs, rhs);
++  }
++  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++  ma_b(base, label);
++}
++
++void MacroAssembler::branchTestPtr(Condition cond, Register lhs, Imm32 rhs,
++                                   Label* label) {
++  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++             cond == NotSigned);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  if (is_uintN(rhs.value, 16)) {
++    as_andi_rc(scratch, lhs, rhs.value);
++  } else {
++    move32(rhs, scratch);
++    as_and__rc(scratch, lhs, scratch);  // record form folds the cmpdi
++  }
++  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++  ma_b(base, label);
++}
++
++void MacroAssembler::branchTestPtr(Condition cond, Register lhs, ImmWord rhs,
++                                   Label* label) {
++  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++             cond == NotSigned);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(rhs, scratch);
++  as_and__rc(scratch, lhs, scratch);
++  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++  ma_b(base, label);
++}
++
++void MacroAssembler::branchTestPtr(Condition cond, const Address& lhs,
++                                   Imm32 rhs, Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(lhs, scratch);
++  // andPtr picks up the rlwinm contig-mask fast path for non-16-bit-fit
++  // immediates that are a contiguous run of 1-bits.
++  andPtr(rhs, scratch);
++  as_cmpdi(scratch, 0);
++  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++  ma_b(base, label);
++}
++
++void MacroAssembler::branchTest64(Condition cond, Register64 lhs,
++                                  Register64 rhs, Register temp, Label* success,
++                                  Label* fail) {
++  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++             cond == NotSigned);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  as_and__rc(scratch, lhs.reg, rhs.reg);
++  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++  if (fail) {
++    ma_b(base, success);
++    jump(fail);
++  } else {
++    ma_b(base, success);
++  }
++}
++
++void MacroAssembler::branchTest64(Condition cond, Register64 lhs, Imm64 rhs,
++                                  Label* success, Label* fail) {
++  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++             cond == NotSigned);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord(uint64_t(rhs.value)), scratch);
++  as_and__rc(scratch, lhs.reg, scratch);
++  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++  if (fail) {
++    ma_b(base, success);
++    jump(fail);
++  } else {
++    ma_b(base, success);
++  }
++}
++
++// ===============================================================
++// Value-type branch functions
++
++void MacroAssembler::branchTestUndefined(Condition cond, Register tag,
++                                         Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_UNDEFINED), cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchTestUndefined(Condition cond,
++                                         const ValueOperand& value,
++                                         Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  splitTag(value, scratch);
++  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_UNDEFINED), label);
++}
++
++void MacroAssembler::branchTestUndefined(Condition cond, const Address& address,
++                                         Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_UNDEFINED), label);
++}
++
++void MacroAssembler::branchTestUndefined(Condition cond,
++                                         const BaseIndex& address,
++                                         Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_UNDEFINED), label);
++}
++
++void MacroAssembler::branchTestInt32(Condition cond, Register tag,
++                                     Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_INT32), cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchTestInt32(Condition cond, const ValueOperand& value,
++                                     Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  splitTag(value, scratch);
++  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_INT32), label);
++}
++
++void MacroAssembler::branchTestInt32(Condition cond, const Address& address,
++                                     Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_INT32), label);
++}
++
++void MacroAssembler::branchTestInt32(Condition cond, const BaseIndex& address,
++                                     Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_INT32), label);
++}
++
++void MacroAssembler::branchTestInt32Truthy(bool b, const ValueOperand& value,
++                                           Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  unboxInt32(value, scratch);
++  as_cmpwi(scratch, 0);
++  ma_b(b ? NotEqual : Equal, label);
++}
++
++void MacroAssembler::branchTestDouble(Condition cond, Register tag,
++                                      Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  Condition actual = (cond == Equal) ? BelowOrEqual : Above;
++  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_MAX_DOUBLE), actual);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchTestDouble(Condition cond, const ValueOperand& value,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  splitTag(value, scratch);
++  branchTestDouble(cond, scratch, label);
++}
++
++void MacroAssembler::branchTestDouble(Condition cond, const Address& address,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestDouble(cond, tag, label);
++}
++
++void MacroAssembler::branchTestDouble(Condition cond, const BaseIndex& address,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestDouble(cond, tag, label);
++}
++
++void MacroAssembler::branchTestDoubleTruthy(bool b, FloatRegister value,
++                                            Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  xs_li(scratch, 0);
++  as_mtvsrd(ScratchDoubleReg, scratch);
++  as_fcmpu(value, ScratchDoubleReg);
++  DoubleCondition cond = b ? DoubleNotEqual : DoubleEqualOrUnordered;
++  ma_b(cond, label);
++}
++
++void MacroAssembler::branchTestNumber(Condition cond, Register tag,
++                                      Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  Condition actual = (cond == Equal) ? BelowOrEqual : Above;
++  Condition c = ma_cmp(tag, Imm32(JS::detail::ValueUpperInclNumberTag), actual);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchTestNumber(Condition cond, const ValueOperand& value,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  splitTag(value, scratch);
++  branchTestNumber(cond, scratch, label);
++}
++
++void MacroAssembler::branchTestBoolean(Condition cond, Register tag,
++                                       Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_BOOLEAN), cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchTestBoolean(Condition cond,
++                                       const ValueOperand& value,
++                                       Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  splitTag(value, scratch);
++  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_BOOLEAN), label);
++}
++
++void MacroAssembler::branchTestBoolean(Condition cond, const Address& address,
++                                       Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_BOOLEAN), label);
++}
++
++void MacroAssembler::branchTestBoolean(Condition cond, const BaseIndex& address,
++                                       Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_BOOLEAN), label);
++}
++
++void MacroAssembler::branchTestBooleanTruthy(bool b, const ValueOperand& value,
++                                             Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  unboxBoolean(value, scratch);
++  as_cmpwi(scratch, 0);
++  ma_b(b ? NotEqual : Equal, label);
++}
++
++void MacroAssembler::branchTestString(Condition cond, Register tag,
++                                      Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_STRING), cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchTestString(Condition cond, const ValueOperand& value,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  splitTag(value, scratch);
++  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_STRING), label);
++}
++
++void MacroAssembler::branchTestString(Condition cond, const Address& address,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_STRING), label);
++}
++
++void MacroAssembler::branchTestString(Condition cond, const BaseIndex& address,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_STRING), label);
++}
++
++void MacroAssembler::branchTestStringTruthy(bool b, const ValueOperand& value,
++                                            Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  unboxString(value, scratch);
++  load32(Address(scratch, JSString::offsetOfLength()), scratch);
++  as_cmpwi(scratch, 0);
++  ma_b(b ? NotEqual : Equal, label);
++}
++
++void MacroAssembler::branchTestSymbol(Condition cond, Register tag,
++                                      Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_SYMBOL), cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchTestSymbol(Condition cond, const ValueOperand& value,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  splitTag(value, scratch);
++  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_SYMBOL), label);
++}
++
++void MacroAssembler::branchTestSymbol(Condition cond, const BaseIndex& address,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_SYMBOL), label);
++}
++
++void MacroAssembler::branchTestSymbol(Condition cond, const Address& address,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_SYMBOL), label);
++}
++
++void MacroAssembler::branchTestBigInt(Condition cond, Register tag,
++                                      Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_BIGINT), cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchTestBigInt(Condition cond, const ValueOperand& value,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  splitTag(value, scratch);
++  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_BIGINT), label);
++}
++
++void MacroAssembler::branchTestBigInt(Condition cond, const Address& address,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_BIGINT), label);
++}
++
++void MacroAssembler::branchTestBigInt(Condition cond, const BaseIndex& address,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_BIGINT), label);
++}
++
++void MacroAssembler::branchTestBigIntTruthy(bool b, const ValueOperand& value,
++                                            Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  unboxBigInt(value, scratch);
++  load32(Address(scratch, BigInt::offsetOfDigitLength()), scratch);
++  as_cmpwi(scratch, 0);
++  ma_b(b ? NotEqual : Equal, label);
++}
++
++void MacroAssembler::branchTestNull(Condition cond, Register tag,
++                                    Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_NULL), cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchTestNull(Condition cond, const ValueOperand& value,
++                                    Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  splitTag(value, scratch);
++  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_NULL), label);
++}
++
++void MacroAssembler::branchTestNull(Condition cond, const Address& address,
++                                    Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_NULL), label);
++}
++
++void MacroAssembler::branchTestNull(Condition cond, const BaseIndex& address,
++                                    Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_NULL), label);
++}
++
++void MacroAssembler::branchTestObject(Condition cond, Register tag,
++                                      Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_OBJECT), cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchTestObject(Condition cond, const ValueOperand& value,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  splitTag(value, scratch);
++  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_OBJECT), label);
++}
++
++void MacroAssembler::branchTestObject(Condition cond, const Address& address,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_OBJECT), label);
++}
++
++void MacroAssembler::branchTestObject(Condition cond, const BaseIndex& address,
++                                      Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_OBJECT), label);
++}
++
++void MacroAssembler::branchTestPrimitive(Condition cond,
++                                         const ValueOperand& value,
++                                         Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  splitTag(value, scratch);
++  branchTestPrimitive(cond, scratch, label);
++}
++
++void MacroAssembler::branchTestGCThing(Condition cond, const Address& address,
++                                       Label* label) {
++  branchTestGCThingImpl(cond, address, label);
++}
++
++void MacroAssembler::branchTestGCThing(Condition cond, const BaseIndex& address,
++                                       Label* label) {
++  branchTestGCThingImpl(cond, address, label);
++}
++
++void MacroAssembler::branchTestGCThing(Condition cond,
++                                       const ValueOperand& address,
++                                       Label* label) {
++  branchTestGCThingImpl(cond, address, label);
++}
++
++template <typename T>
++void MacroAssembler::branchTestGCThingImpl(Condition cond, const T& address,
++                                           Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  Condition actual = (cond == Equal) ? AboveOrEqual : Below;
++  Condition c =
++      ma_cmp(tag, Imm32(JS::detail::ValueLowerInclGCThingTag), actual);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchTestPrimitive(Condition cond, Register tag,
++                                         Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  Condition actual = (cond == Equal) ? Below : AboveOrEqual;
++  Condition c =
++      ma_cmp(tag, Imm32(JS::detail::ValueUpperExclPrimitiveTag), actual);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchTestMagic(Condition cond, Register tag,
++                                     Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_MAGIC), cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchTestMagic(Condition cond, const Address& address,
++                                     Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_MAGIC), label);
++}
++
++void MacroAssembler::branchTestMagic(Condition cond, const BaseIndex& address,
++                                     Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(address, scratch);
++  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_MAGIC), label);
++}
++
++void MacroAssembler::branchTestMagic(Condition cond, const ValueOperand& value,
++                                     Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  splitTag(value, scratch);
++  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_MAGIC), label);
++}
++
++void MacroAssembler::branchTestMagic(Condition cond, const Address& valaddr,
++                                     JSWhyMagic why, Label* label) {
++  uint64_t magic = MagicValue(why).asRawBits();
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(valaddr, scratch);
++  Condition c = ma_cmp(scratch, ImmWord(magic), cond);
++  ma_b(c, label);
++}
++
++void MacroAssembler::branchTestMagic(Condition cond, const BaseIndex& valaddr,
++                                     JSWhyMagic why, Label* label) {
++  uint64_t magic = MagicValue(why).asRawBits();
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(valaddr, scratch);
++  Condition c = ma_cmp(scratch, ImmWord(magic), cond);
++  ma_b(c, label);
++}
++
++template <typename T>
++void MacroAssembler::branchTestValue(Condition cond, const T& lhs,
++                                     const ValueOperand& rhs, Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(lhs, scratch);
++  Condition c = ma_cmp(scratch, rhs.valueReg(), cond);
++  ma_b(c, label);
++}
++
++// ===============================================================
++// Test-set functions
++
++template <typename T>
++void MacroAssembler::testNumberSet(Condition cond, const T& src,
++                                   Register dest) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(src, scratch);
++  Condition actual = (cond == Equal) ? BelowOrEqual : Above;
++  Condition c = ma_cmp(tag, Imm32(JS::detail::ValueUpperInclNumberTag), actual);
++  ma_cmp_set(dest, c);
++}
++
++template <typename T>
++void MacroAssembler::testBooleanSet(Condition cond, const T& src,
++                                    Register dest) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(src, scratch);
++  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_BOOLEAN), cond);
++  ma_cmp_set(dest, c);
++}
++
++template <typename T>
++void MacroAssembler::testStringSet(Condition cond, const T& src,
++                                   Register dest) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(src, scratch);
++  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_STRING), cond);
++  ma_cmp_set(dest, c);
++}
++
++template <typename T>
++void MacroAssembler::testSymbolSet(Condition cond, const T& src,
++                                   Register dest) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(src, scratch);
++  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_SYMBOL), cond);
++  ma_cmp_set(dest, c);
++}
++
++template <typename T>
++void MacroAssembler::testBigIntSet(Condition cond, const T& src,
++                                   Register dest) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  Register tag = extractTag(src, scratch);
++  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_BIGINT), cond);
++  ma_cmp_set(dest, c);
++}
++
++// ===============================================================
++// Computed address / conditional move / conditional load
++
++void MacroAssembler::branchToComputedAddress(const BaseIndex& addr) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(addr, scratch);
++  branch(scratch);
++}
++
++void MacroAssembler::cmp32Move32(Condition cond, Register lhs, Imm32 rhs,
++                                 Register src, Register dest) {
++  Condition c = ma_cmp(lhs, rhs, cond, true);
++  ma_cmp_move(dest, src, c);
++}
++
++void MacroAssembler::cmp32Move32(Condition cond, Register lhs, Register rhs,
++                                 Register src, Register dest) {
++  Condition c = ma_cmp(lhs, rhs, cond, true);
++  ma_cmp_move(dest, src, c);
++}
++
++void MacroAssembler::cmp32Move32(Condition cond, Register lhs,
++                                 const Address& rhs, Register src,
++                                 Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(rhs, scratch);
++  Condition c = ma_cmp(lhs, scratch, cond, true);
++  ma_cmp_move(dest, src, c);
++}
++
++void MacroAssembler::cmp32MovePtr(Condition cond, Register lhs, Imm32 rhs,
++                                  Register src, Register dest) {
++  Condition c = ma_cmp(lhs, rhs, cond, true);
++  ma_cmp_move(dest, src, c);
++}
++
++void MacroAssembler::cmpPtrMovePtr(Condition cond, Register lhs, Imm32 rhs,
++                                   Register src, Register dest) {
++  Condition c = ma_cmp(lhs, ImmWord(int64_t(rhs.value)), cond);
++  ma_cmp_move(dest, src, c);
++}
++
++void MacroAssembler::cmpPtrMovePtr(Condition cond, Register lhs, Register rhs,
++                                   Register src, Register dest) {
++  Condition c = ma_cmp(lhs, rhs, cond);
++  ma_cmp_move(dest, src, c);
++}
++
++void MacroAssembler::cmpPtrMovePtr(Condition cond, Register lhs,
++                                   const Address& rhs, Register src,
++                                   Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(rhs, scratch);
++  Condition c = ma_cmp(lhs, scratch, cond);
++  ma_cmp_move(dest, src, c);
++}
++
++void MacroAssembler::cmp32Load32(Condition cond, Register lhs,
++                                 const Address& rhs, const Address& src,
++                                 Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(rhs, scratch);
++  Condition c = ma_cmp(lhs, scratch, cond, true);
++  // Conditional load: load into scratch, then isel.
++  load32(src, scratch);
++  ma_cmp_move(dest, scratch, c);
++}
++
++void MacroAssembler::cmp32Load32(Condition cond, Register lhs, Register rhs,
++                                 const Address& src, Register dest) {
++  Condition c = ma_cmp(lhs, rhs, cond, true);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(src, scratch);
++  ma_cmp_move(dest, scratch, c);
++}
++
++void MacroAssembler::cmp32Load32(Condition cond, Register lhs, Imm32 rhs,
++                                 const Address& src, Register dest) {
++  Condition c = ma_cmp(lhs, rhs, cond, true);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(src, scratch);
++  ma_cmp_move(dest, scratch, c);
++}
++
++void MacroAssembler::cmp32LoadPtr(Condition cond, const Address& lhs, Imm32 rhs,
++                                  const Address& src, Register dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(lhs, scratch);
++  Condition c = ma_cmp(scratch, rhs, cond, true);
++  loadPtr(src, scratch);
++  ma_cmp_move(dest, scratch, c);
++}
++
++void MacroAssembler::test32LoadPtr(Condition cond, const Address& addr,
++                                   Imm32 mask, const Address& src,
++                                   Register dest) {
++  MOZ_ASSERT(cond == Zero || cond == NonZero);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(addr, scratch);
++  if (is_uintN(mask.value, 16)) {
++    as_andi_rc(scratch, scratch, mask.value);
++  } else {
++    // Use a nested scope so scratch2 is released before loadPtr below.
++    UseScratchRegisterScope temps2(asMasm());
++    Register scratch2 = temps2.Acquire();
++    move32(mask, scratch2);
++    as_and__rc(scratch, scratch, scratch2);  // record form folds the cmpdi
++  }
++  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++  loadPtr(src, scratch);
++  ma_cmp_move(dest, scratch, base);
++}
++
++void MacroAssembler::test32MovePtr(Condition cond, Register operand, Imm32 mask,
++                                   Register src, Register dest) {
++  MOZ_ASSERT(cond == Zero || cond == NonZero);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  if (is_uintN(mask.value, 16)) {
++    as_andi_rc(scratch, operand, mask.value);
++  } else {
++    move32(mask, scratch);
++    as_and__rc(scratch, operand, scratch);  // record form folds the cmpdi
++  }
++  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++  ma_cmp_move(dest, src, base);
++}
++
++void MacroAssembler::test32MovePtr(Condition cond, const Address& addr,
++                                   Imm32 mask, Register src, Register dest) {
++  MOZ_ASSERT(cond == Zero || cond == NonZero);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(addr, scratch);
++  and32(mask, scratch);
++  as_cmpdi(scratch, 0);
++  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++  ma_cmp_move(dest, src, base);
++}
++
++// ===============================================================
++// Spectre mitigations
++
++void MacroAssembler::spectreMovePtr(Condition cond, Register src,
++                                    Register dest) {
++  // Assumes compare already issued.
++  Condition base = static_cast<Condition>(
++      cond & ~(Assembler::ConditionUnsigned | Assembler::ConditionZero));
++  ma_cmp_move(dest, src, base);
++}
++
++void MacroAssembler::spectreZeroRegister(Condition cond, Register scratch,
++                                         Register dest) {
++  // Assumes compare already issued. Zero dest if condition is true.
++  Condition origBase = static_cast<Condition>(
++      cond & ~(Assembler::ConditionUnsigned | Assembler::ConditionZero));
++  // If original condition is true, we want dest=0.
++  // isel: if condition true, select zero; else keep dest.
++  xs_li(scratch, 0);
++  ma_cmp_move(dest, scratch, origBase);
++}
++
++void MacroAssembler::spectreBoundsCheck32(Register index, Register length,
++                                          Register maybeScratch,
++                                          Label* failure) {
++  Condition c = ma_cmp(index, length, Below, true);
++  if (failure) {
++    ma_b(InvertCondition(c), failure);
++  }
++  if (maybeScratch != InvalidReg) {
++    xs_li(maybeScratch, 0);
++    ma_cmp_move(index, maybeScratch, InvertCondition(c));
++  }
++}
++
++void MacroAssembler::spectreBoundsCheck32(Register index, const Address& length,
++                                          Register maybeScratch,
++                                          Label* failure) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(length, scratch);
++  spectreBoundsCheck32(index, scratch, maybeScratch, failure);
++}
++
++void MacroAssembler::spectreBoundsCheckPtr(Register index, Register length,
++                                           Register maybeScratch,
++                                           Label* failure) {
++  Condition c = ma_cmp(index, length, Below);
++  if (failure) {
++    ma_b(InvertCondition(c), failure);
++  }
++  if (maybeScratch != InvalidReg) {
++    xs_li(maybeScratch, 0);
++    ma_cmp_move(index, maybeScratch, InvertCondition(c));
++  }
++}
++
++void MacroAssembler::spectreBoundsCheckPtr(Register index,
++                                           const Address& length,
++                                           Register maybeScratch,
++                                           Label* failure) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(length, scratch);
++  spectreBoundsCheckPtr(index, scratch, maybeScratch, failure);
++}
++
++// ===============================================================
++// Memory access primitives
++
++FaultingCodeOffset MacroAssembler::storeFloat32(FloatRegister src,
++                                                const Address& addr) {
++  MOZ_ASSERT(addr.base != r0);
++  if (is_intN(addr.offset, 16)) {
++    return FaultingCodeOffset(as_stfs(src, addr.base, addr.offset).getOffset());
++  }
++  if (HasPOWER10() && is_intN((intptr_t)addr.offset, 34)) {
++    return FaultingCodeOffset(
++        as_pstfs(src, addr.base, (int64_t)addr.offset, /*R=*/false)
++            .getOffset());
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord(addr.offset), scratch);
++  return FaultingCodeOffset(as_stfsx(src, addr.base, scratch).getOffset());
++}
++
++FaultingCodeOffset MacroAssembler::storeFloat32(FloatRegister src,
++                                                const BaseIndex& addr) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  computeEffectiveAddress(addr, scratch);
++  return FaultingCodeOffset(as_stfs(src, scratch, 0).getOffset());
++}
++
++FaultingCodeOffset MacroAssembler::storeDouble(FloatRegister src,
++                                               const Address& addr) {
++  MOZ_ASSERT(addr.base != r0);
++  if (is_intN(addr.offset, 16)) {
++    return FaultingCodeOffset(as_stfd(src, addr.base, addr.offset).getOffset());
++  }
++  if (HasPOWER10() && is_intN((intptr_t)addr.offset, 34)) {
++    return FaultingCodeOffset(
++        as_pstfd(src, addr.base, (int64_t)addr.offset, /*R=*/false)
++            .getOffset());
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  movePtr(ImmWord(addr.offset), scratch);
++  return FaultingCodeOffset(as_stfdx(src, addr.base, scratch).getOffset());
++}
++
++FaultingCodeOffset MacroAssembler::storeDouble(FloatRegister src,
++                                               const BaseIndex& addr) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  computeEffectiveAddress(addr, scratch);
++  return FaultingCodeOffset(as_stfd(src, scratch, 0).getOffset());
++}
++
++FaultingCodeOffset MacroAssembler::storeFloat16(FloatRegister src,
++                                                const Address& dest,
++                                                Register temp) {
++  MOZ_ASSERT(HasPOWER9());
++  if (dest.offset == 0) {
++    return FaultingCodeOffset(as_stxsihx(src, r0, dest.base).getOffset());
++  }
++  if (is_intN(dest.offset, 16)) {
++    as_addi(temp, dest.base, dest.offset);
++    return FaultingCodeOffset(as_stxsihx(src, r0, temp).getOffset());
++  }
++  movePtr(ImmWord(dest.offset), temp);
++  return FaultingCodeOffset(as_stxsihx(src, dest.base, temp).getOffset());
++}
++
++FaultingCodeOffset MacroAssembler::storeFloat16(FloatRegister src,
++                                                const BaseIndex& dest,
++                                                Register temp) {
++  MOZ_ASSERT(HasPOWER9());
++  computeEffectiveAddress(dest, temp);
++  return FaultingCodeOffset(as_stxsihx(src, r0, temp).getOffset());
++}
++
++void MacroAssembler::memoryBarrier(MemoryBarrier barrier) {
++  if (barrier.isNone()) {
++    return;
++  }
++  if (barrier.hasStoreLoad() || barrier.hasSync()) {
++    as_sync();
++  } else {
++    as_lwsync();
++  }
++}
++
++// ===============================================================
++// Clamping functions
++
++void MacroAssembler::clampIntToUint8(Register reg) {
++  // Clamp to [0, 255].
++  Label done;
++  as_cmpwi(reg, 255);
++  ma_b(LessThanOrEqual, &done);
++  move32(Imm32(255), reg);
++  bind(&done);
++  Label positive;
++  as_cmpwi(reg, 0);
++  ma_b(GreaterThanOrEqual, &positive);
++  move32(Imm32(0), reg);
++  bind(&positive);
++}
++
++// ===============================================================
++// Unboxing
++
++void MacroAssembler::fallibleUnboxPtr(const ValueOperand& src, Register dest,
++                                      JSValueType type, Label* fail) {
++  MOZ_ASSERT(type == JSVAL_TYPE_OBJECT || type == JSVAL_TYPE_STRING ||
++             type == JSVAL_TYPE_SYMBOL || type == JSVAL_TYPE_BIGINT);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  splitTag(src, scratch);
++  Condition c = ma_cmp(scratch, ImmTag(JSVAL_TYPE_TO_TAG(type)), NotEqual);
++  ma_b(c, fail);
++  unboxNonDouble(src, dest, type);
++}
++
++void MacroAssembler::fallibleUnboxPtr(const Address& src, Register dest,
++                                      JSValueType type, Label* fail) {
++  loadValue(src, ValueOperand(dest));
++  fallibleUnboxPtr(ValueOperand(dest), dest, type, fail);
++}
++
++void MacroAssembler::fallibleUnboxPtr(const BaseIndex& src, Register dest,
++                                      JSValueType type, Label* fail) {
++  loadValue(src, ValueOperand(dest));
++  fallibleUnboxPtr(ValueOperand(dest), dest, type, fail);
++}
++
++void MacroAssembler::wasmAddSubI128HI64(Register lhsLo, Register lhsHi,
++                                        Register rhsLo, Register rhsHi,
++                                        Register output, bool isAdd) {
++  MOZ_RELEASE_ASSERT(output != lhsLo && output != lhsHi && output != rhsLo &&
++                     output != rhsHi);
++  if (isAdd) {
++    // addc sets CA (carry), adde uses it.
++    as_addc(output, lhsLo, rhsLo);  // output = lhsLo + rhsLo, CA = carry
++    as_adde(output, lhsHi, rhsHi);  // output = lhsHi + rhsHi + CA
++  } else {
++    // subfc: rd = rb - ra, sets CA (borrow complement).
++    // subfe: rd = rb + ~ra + CA.
++    as_subfc(output, rhsLo, lhsLo);  // output = lhsLo - rhsLo, CA = ~borrow
++    as_subfe(output, rhsHi, lhsHi);  // output = lhsHi - rhsHi - borrow
++  }
++}
++
++void MacroAssembler::wasmMulI64WideHI64(Register lhs, Register rhs,
++                                        Register output, bool isSigned) {
++  if (isSigned) {
++    as_mulhd(output, lhs, rhs);
++  } else {
++    as_mulhdu(output, lhs, rhs);
++  }
++}
++
++//}}} check_macroassembler_style
++
++void MacroAssemblerPPC64Compat::incrementInt32Value(const Address& addr) {
++  asMasm().add32(Imm32(1), addr);
++}
++
++void MacroAssemblerPPC64Compat::retn(Imm32 n) {
++  // Load return address from [SP,0] first, then adjust SP, then return.
++  // Must load RA before adjusting SP (like loong64), since the RA is at
++  // the current top of stack, not at SP+n.
++  UseScratchRegisterScope temps(*this);
++  Register scratch = temps.Acquire();
++  as_ld(scratch, StackPointer, 0);
++  if (n.value != 0) {
++    asMasm().addPtr(Imm32(n.value), StackPointer);
++  }
++  xs_mtlr(scratch);
++  as_blr();
++}
++
++// ===============================================================
++// Template specializations (outside check_macroassembler_style)
++
++template <>
++inline void MacroAssembler::cmpPtrSet(Assembler::Condition cond, Address lhs,
++                                      ImmPtr rhs, Register dest) {
++  UseScratchRegisterScope temps(*this);
++  Register scratch = temps.Acquire();
++  loadPtr(lhs, scratch);
++  Assembler::Condition c = ma_cmp(scratch, rhs, cond);
++  ma_cmp_set(dest, c);
++}
++
++template <>
++inline void MacroAssembler::cmpPtrSet(Assembler::Condition cond, Register lhs,
++                                      Address rhs, Register dest) {
++  UseScratchRegisterScope temps(*this);
++  Register scratch = temps.Acquire();
++  loadPtr(rhs, scratch);
++  Assembler::Condition c = ma_cmp(lhs, scratch, cond);
++  ma_cmp_set(dest, c);
++}
++
++template <>
++inline void MacroAssembler::cmpPtrSet(Assembler::Condition cond, Address lhs,
++                                      Register rhs, Register dest) {
++  UseScratchRegisterScope temps(*this);
++  Register scratch = temps.Acquire();
++  loadPtr(lhs, scratch);
++  Assembler::Condition c = ma_cmp(scratch, rhs, cond);
++  ma_cmp_set(dest, c);
++}
++
++template <>
++inline void MacroAssembler::cmp32Set(Assembler::Condition cond, Register lhs,
++                                     Address rhs, Register dest) {
++  UseScratchRegisterScope temps(*this);
++  Register scratch = temps.Acquire();
++  load32(rhs, scratch);
++  Assembler::Condition c = ma_cmp(lhs, scratch, cond, true);
++  ma_cmp_set(dest, c);
++}
++
++template <>
++inline void MacroAssembler::cmp32Set(Assembler::Condition cond, Address lhs,
++                                     Register rhs, Register dest) {
++  UseScratchRegisterScope temps(*this);
++  Register scratch = temps.Acquire();
++  load32(lhs, scratch);
++  Assembler::Condition c = ma_cmp(scratch, rhs, cond, true);
++  ma_cmp_set(dest, c);
++}
++
++template <>
++inline void MacroAssembler::cmp32Set(Assembler::Condition cond, Address lhs,
++                                     Imm32 rhs, Register dest) {
++  UseScratchRegisterScope temps(*this);
++  Register scratch = temps.Acquire();
++  load32(lhs, scratch);
++  Assembler::Condition c = ma_cmp(scratch, rhs, cond, true);
++  ma_cmp_set(dest, c);
++}
++
++//{{{ check_macroassembler_style
++// ===============================================================
++// SIMD load/store (128-bit)
++
++FaultingCodeOffset MacroAssembler::loadUnalignedSimd128(const Address& src,
++                                                        FloatRegister dest) {
++  UseScratchRegisterScope temps(asMasm());
++  if (HasPOWER10() && is_intN((intptr_t)src.offset, 34)) {
++    // POWER10 prefixed load — natural-LE byte order, no GPR scratch.
++    return FaultingCodeOffset(
++        as_plxv(dest.encoding(), src.base, (int64_t)src.offset, /*R=*/false)
++            .getOffset());
++  }
++  if (HasPOWER9()) {
++    // POWER9: lxvx (X-form, indexed) loads 128 bits in correct LE order.
++    Register scratch = temps.Acquire();
++    if (src.offset == 0) {
++      // RA=0 means "use 0 as base" in indexed forms, so use r0 encoding.
++      return FaultingCodeOffset(as_lxvx(dest, r0, src.base).getOffset());
++    }
++    movePtr(ImmWord(src.offset), scratch);
++    return FaultingCodeOffset(as_lxvx(dest, src.base, scratch).getOffset());
++  }
++  // POWER8: lxvd2x loads with doubleword swap on LE. Fix with xxpermdi.
++  Register scratch = temps.Acquire();
++  FaultingCodeOffset fco;
++  if (src.offset == 0) {
++    fco = FaultingCodeOffset(as_lxvd2x(dest, r0, src.base).getOffset());
++  } else {
++    movePtr(ImmWord(src.offset), scratch);
++    fco = FaultingCodeOffset(as_lxvd2x(dest, src.base, scratch).getOffset());
++  }
++  as_xxpermdi(dest, dest, dest, 2);
++  return fco;
++}
++
++FaultingCodeOffset MacroAssembler::loadUnalignedSimd128(const BaseIndex& src,
++                                                        FloatRegister dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  computeScaledAddress(src, scratch);
++  if (src.offset != 0) {
++    // addPtr picks up POWER10 paddi (1 prefixed insn) when available;
++    // falls back to movePtr+add on P9/P8. Drops the explicit scratch2.
++    addPtr(ImmWord(src.offset), scratch);
++  }
++  if (HasPOWER9()) {
++    return FaultingCodeOffset(as_lxvx(dest, r0, scratch).getOffset());
++  }
++  FaultingCodeOffset fco(as_lxvd2x(dest, r0, scratch).getOffset());
++  as_xxpermdi(dest, dest, dest, 2);
++  return fco;
++}
++
++FaultingCodeOffset MacroAssembler::storeUnalignedSimd128(FloatRegister src,
++                                                         const Address& dest) {
++  UseScratchRegisterScope temps(asMasm());
++  if (HasPOWER10() && is_intN((intptr_t)dest.offset, 34)) {
++    // POWER10 prefixed store — natural-LE byte order, no GPR scratch.
++    return FaultingCodeOffset(
++        as_pstxv(src.encoding(), dest.base, (int64_t)dest.offset, /*R=*/false)
++            .getOffset());
++  }
++  if (HasPOWER9()) {
++    Register scratch = temps.Acquire();
++    if (dest.offset == 0) {
++      return FaultingCodeOffset(as_stxvx(src, r0, dest.base).getOffset());
++    }
++    movePtr(ImmWord(dest.offset), scratch);
++    return FaultingCodeOffset(as_stxvx(src, dest.base, scratch).getOffset());
++  }
++  // POWER8: stxvd2x stores with doubleword swap on LE.
++  // Swap before store, then swap back to restore the register.
++  ScratchSimd128Scope scratch128(*this);
++  as_xxpermdi(scratch128, src, src, 2);
++  Register scratch = temps.Acquire();
++  FaultingCodeOffset fco;
++  if (dest.offset == 0) {
++    fco = FaultingCodeOffset(as_stxvd2x(scratch128, r0, dest.base).getOffset());
++  } else {
++    movePtr(ImmWord(dest.offset), scratch);
++    fco = FaultingCodeOffset(
++        as_stxvd2x(scratch128, dest.base, scratch).getOffset());
++  }
++  return fco;
++}
++
++FaultingCodeOffset MacroAssembler::storeUnalignedSimd128(
++    FloatRegister src, const BaseIndex& dest) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  computeScaledAddress(dest, scratch);
++  if (dest.offset != 0) {
++    addPtr(ImmWord(dest.offset), scratch);
++  }
++  if (HasPOWER9()) {
++    return FaultingCodeOffset(as_stxvx(src, r0, scratch).getOffset());
++  }
++  ScratchSimd128Scope scratch128(*this);
++  as_xxpermdi(scratch128, src, src, 2);
++  return FaultingCodeOffset(as_stxvd2x(scratch128, r0, scratch).getOffset());
++}
++
++// ===============================================================
++// SIMD operations
++//
++// Scratch register conventions for SIMD helpers (read this before writing
++// a new one):
++//
++// 1. `ScratchSimd128Scope scratch(*this)` — acquires v0 (= VR0 = VSR32,
++//    non-allocatable). Constructed as {FloatRegisters::f0, Simd128} so
++//    encoding() = 0 + 32 = 32 (per Architecture-ppc64.h). Default temp.
++//    One scope at a time per helper. Safe to pass to any VMX/VSX
++//    instruction; the allocator never places a live v128 in v0.
++//
++// 2. **Do NOT** write to VR1..VR31 (= VSR33..VSR63) without a Lowering
++//    temp. VR1..VR31 are allocatable; a live wasm v128 may be sitting in
++//    any of them. Use `ScratchSimd128Scope` (rule 1) or a Lowering temp.
++//
++// 3. **Red-zone stash** — use `RedZoneStashSimd128` / `RedZoneRestoreSimd128`
++//    (declared just below) when a helper genuinely needs >1 SIMD scratch
++//    AND adding a Lowering temp would require LIR + MIR + CodeGen changes.
++//    ELFv2 reserves 288 bytes below SP; we use at most 32 (two 16-byte
++//    slots). Live users: `extAddPairwiseInt*` (2 slots), `swizzleInt8x16`
++//    (1 slot), `dotInt8x16Int7x16ThenAdd` 4-arg (1 slot). If you find
++//    yourself wanting a 3rd slot or nested save/restore, prefer a Lowering
++//    temp instead — the red-zone approach is tolerable because it's
++//    self-contained to a single helper. The `MOZ_ASSERT(slot < 2)` inside
++//    the helpers enforces this at test time.
++//
++// Simd128 lives in VR-namespace (VSR32-63), so VMX ops address Simd128
++// FloatRegisters directly with no staging. Encoding is 32-63; the VMX
++// VR field is 5-bit (0-31), so we mask with `& 31`.
++
++// Two 16-byte Simd128 slots available in the ELFv2 red zone for short-lived
++// SIMD spills (see point 3 of the SIMD conventions preamble above).
++static constexpr int kRedZoneSimd128MaxSlots = 2;
++
++static inline void RedZoneStashSimd128(MacroAssembler& masm, FloatRegister src,
++                                       int slot) {
++  MOZ_ASSERT(slot >= 0 && slot < kRedZoneSimd128MaxSlots);
++  masm.storeUnalignedSimd128(src, Address(StackPointer, -16 * (slot + 1)));
++}
++
++static inline void RedZoneRestoreSimd128(MacroAssembler& masm, int slot,
++                                         FloatRegister dest) {
++  MOZ_ASSERT(slot >= 0 && slot < kRedZoneSimd128MaxSlots);
++  masm.loadUnalignedSimd128(Address(StackPointer, -16 * (slot + 1)), dest);
++}
++
++typedef void (*VmxBinaryFn)(Assembler&, uint8_t, uint8_t, uint8_t);
++
++static void EmitVmxBinary(MacroAssembler& masm, VmxBinaryFn vmxOp,
++                          FloatRegister lhs, FloatRegister rhs,
++                          FloatRegister dest) {
++  vmxOp(static_cast<Assembler&>(masm), dest.encoding() & 31,
++        lhs.encoding() & 31, rhs.encoding() & 31);
++}
++
++// Macro for defining VMX binary wrappers.
++#define VMX_BINARY_WRAPPER(vmxInst)                         \
++  [](Assembler& a, uint8_t vrt, uint8_t vra, uint8_t vrb) { \
++    a.as_##vmxInst(vrt, vra, vrb);                          \
++  }
++
++// Emit op directly on Simd128 dest, then xxlnor in place.
++template <typename VmxBinaryFnT>
++static void EmitVmxBinaryNot(MacroAssembler& masm, VmxBinaryFnT vmxOp,
++                             FloatRegister lhs, FloatRegister rhs,
++                             FloatRegister dest) {
++  vmxOp(static_cast<Assembler&>(masm), dest.encoding() & 31,
++        lhs.encoding() & 31, rhs.encoding() & 31);
++  masm.as_xxlnor(dest, dest, dest);
++}
++
++// Integer SIMD compare helper. VMX compare instructions produce all-ones
++// for true, all-zeros for false per element.
++// Available VMX compares: vcmpequ* (eq), vcmpgts* (signed gt), vcmpgtu*
++// (unsigned gt). Other conditions derived by swapping operands or
++// complementing.
++template <typename EqFn, typename GtsFn, typename GtuFn>
++static void EmitVmxCompare(MacroAssembler& masm, Assembler::Condition cond,
++                           FloatRegister lhs, FloatRegister rhs,
++                           FloatRegister dest, EqFn eqFn, GtsFn gtsFn,
++                           GtuFn gtuFn) {
++  switch (cond) {
++    case Assembler::Equal:
++      EmitVmxBinary(masm, eqFn, lhs, rhs, dest);
++      break;
++    case Assembler::NotEqual:
++      EmitVmxBinaryNot(masm, eqFn, lhs, rhs, dest);
++      break;
++    case Assembler::GreaterThan:
++      EmitVmxBinary(masm, gtsFn, lhs, rhs, dest);
++      break;
++    case Assembler::GreaterThanOrEqual:
++      // !(rhs > lhs)
++      EmitVmxBinaryNot(masm, gtsFn, rhs, lhs, dest);
++      break;
++    case Assembler::LessThan:
++      // rhs > lhs (swap)
++      EmitVmxBinary(masm, gtsFn, rhs, lhs, dest);
++      break;
++    case Assembler::LessThanOrEqual:
++      // !(lhs > rhs)
++      EmitVmxBinaryNot(masm, gtsFn, lhs, rhs, dest);
++      break;
++    case Assembler::Above:
++      EmitVmxBinary(masm, gtuFn, lhs, rhs, dest);
++      break;
++    case Assembler::AboveOrEqual:
++      EmitVmxBinaryNot(masm, gtuFn, rhs, lhs, dest);
++      break;
++    case Assembler::Below:
++      EmitVmxBinary(masm, gtuFn, rhs, lhs, dest);
++      break;
++    case Assembler::BelowOrEqual:
++      EmitVmxBinaryNot(masm, gtuFn, lhs, rhs, dest);
++      break;
++    default:
++      MOZ_CRASH("Unexpected SIMD integer condition");
++  }
++}
++
++// Emit ternary VMX op directly on Simd128 regs, no staging.
++typedef void (*VmxTernaryFn)(Assembler&, uint8_t, uint8_t, uint8_t, uint8_t);
++
++static void EmitVmxTernary(MacroAssembler& masm, VmxTernaryFn vmxOp,
++                           FloatRegister a, FloatRegister b, FloatRegister c,
++                           FloatRegister dest) {
++  vmxOp(static_cast<Assembler&>(masm), dest.encoding() & 31, a.encoding() & 31,
++        b.encoding() & 31, c.encoding() & 31);
++}
++
++// Emit unary VMX op directly on Simd128 regs, no staging.
++typedef void (*VmxUnaryFn)(Assembler&, uint8_t, uint8_t);
++
++static void EmitVmxUnary(MacroAssembler& masm, VmxUnaryFn vmxOp,
++                         FloatRegister src, FloatRegister dest) {
++  vmxOp(static_cast<Assembler&>(masm), dest.encoding() & 31,
++        src.encoding() & 31);
++}
++
++// Helper: create a zero SIMD register using xxlxor.
++static void ZeroSimd128(MacroAssembler& masm, FloatRegister dest) {
++  masm.as_xxlxor(dest, dest, dest);
++}
++
++void MacroAssembler::moveSimd128(FloatRegister src, FloatRegister dest) {
++  if (src != dest) {
++    as_xxlor(dest, src, src);
++  }
++}
++
++void MacroAssembler::loadConstantSimd128(const SimdConstant& v,
++                                         FloatRegister dest) {
++  // Load 128-bit constant from inline constant pool.
++  // Clobbers SecondScratchReg (r12).
++  loadFromPoolSimd128(dest, v);
++}
++
++// PPC64 LE lane mapping:
++// Wasm lane K = memory byte K = register byte (15-K).
++// mfvsrd extracts register bits[0:63] = BE dword 0 = Wasm lanes 8-15 (bytes).
++// For VMX byte ops, BE byte index = 15 - wasm_lane.
++// For VMX halfword ops, BE halfword index = 7 - wasm_halfword.
++// For VSX word ops (xxspltw), BE word index = 3 - wasm_word.
++// For doubleword ops, BE dword index = 1 - wasm_dword.
++
++void MacroAssembler::splatX16(Register src, FloatRegister dest) {
++  // mtvsrd writes src into BE 0..63 of dest (low byte at BE byte 7);
++  // vspltb then splats that byte over all 16 lanes. dest aliases as
++  // both source and destination — vspltb tolerates this. No extra
++  // scratch register required, so callers that already hold a
++  // ScratchSimd128Scope (extAddPairwise*, var-shift narrow forms) do
++  // not see a nested-acquire collision.
++  as_mtvsrd(dest, src);
++  as_vspltb(dest, dest, 7);
++}
++
++void MacroAssembler::splatX8(Register src, FloatRegister dest) {
++  // Same shape as splatX16 with halfword granularity. mtvsrd places
++  // the low 16 bits at BE halfword 3 (= BE bytes 6..7); vsplth picks
++  // it up and splats across 8 lanes. vsplth reads only the chosen
++  // halfword, so negative i32 inputs do not need a 16-bit pre-mask
++  // (which the previous GPR-replicate path required).
++  as_mtvsrd(dest, src);
++  as_vsplth(dest, dest, 3);
++}
++
++void MacroAssembler::splatX4(Register src, FloatRegister dest) {
++  if (HasPOWER9()) {
++    as_mtvsrws(dest, src);
++  } else {
++    as_mtvsrd(dest, src);
++    as_xxspltw(dest, dest, 1);
++  }
++}
++
++void MacroAssembler::splatX4(FloatRegister src, FloatRegister dest) {
++  // src is a double-precision FPR holding a float value (the JIT keeps
++  // FP32 in DP-equivalent form on PPC64). Convert DP→SP into BE word 0
++  // (xscvdpspn lays the single at bits[0:31] / BE word 0), then splat
++  // word 0 to all four lanes.
++  as_xscvdpspn(dest, src);
++  as_xxspltw(dest, dest, 0);
++}
++
++void MacroAssembler::splatX2(FloatRegister src, FloatRegister dest) {
++  // Splat scalar double to both doubleword lanes.
++  // Scalar value is in register bits[0:63] (BE dword 0).
++  // xxpermdi dm=0: dest = [src.dw0, src.dw0]
++  as_xxpermdi(dest, src, src, 0);
++}
++
++// Helpers: splat Imm32 into SIMD register at various element widths.
++// VMX shift instructions read the shift count from EACH element independently,
++// so the count must be replicated to every byte/halfword/word as appropriate.
++//
++// Fast path for small constants: vspltis{b,h,w} (POWER7+) splats a 5-bit
++// signed immediate to all lanes in 1 insn with no pool entry. For values
++// outside [-16, 15] we fall back to the inline-pool path.
++static void SplatImm8(MacroAssembler& masm, Imm32 imm, FloatRegister dest) {
++  int8_t val = (int8_t)imm.value;
++  if (val >= -16 && val <= 15) {
++    masm.as_vspltisb(dest.encoding() & 31, val);
++    return;
++  }
++  if (HasPOWER9()) {
++    // P9 xxspltib handles the full 8-bit range in 1 insn.
++    masm.as_xxspltib(dest, (uint8_t)val);
++    return;
++  }
++  int8_t bytes[16];
++  for (int i = 0; i < 16; i++) bytes[i] = val;
++  masm.loadConstantSimd128(SimdConstant::CreateX16(bytes), dest);
++}
++
++static void SplatImm16(MacroAssembler& masm, Imm32 imm, FloatRegister dest) {
++  int16_t val = (int16_t)imm.value;
++  if (val >= -16 && val <= 15) {
++    masm.as_vspltish(dest.encoding() & 31, (int8_t)val);
++    return;
++  }
++  int16_t halfs[8];
++  for (int i = 0; i < 8; i++) halfs[i] = val;
++  masm.loadConstantSimd128(SimdConstant::CreateX8(halfs), dest);
++}
++
++static void SplatImm32(MacroAssembler& masm, Imm32 imm, FloatRegister dest) {
++  int32_t val = imm.value;
++  if (val >= -16 && val <= 15) {
++    masm.as_vspltisw(dest.encoding() & 31, (int8_t)val);
++    return;
++  }
++  int32_t words[4] = {val, val, val, val};
++  masm.loadConstantSimd128(SimdConstant::CreateX4(words), dest);
++}
++
++// ===============================================================
++// Extract lane
++
++static void ExtractLaneToGPR(MacroAssembler& masm, uint32_t lane,
++                             FloatRegister src, Register dest,
++                             unsigned laneWidthBytes, unsigned laneWidthBits) {
++  // Extract Wasm lane from vector register to GPR.
++  // Wasm lane K → register byte offset (15 - K*laneWidthBytes).
++  //
++  // Strategy: use mfvsrd to get one 64-bit half of the register, then shift
++  // and mask to isolate the lane.
++  //
++  // mfvsrd gets register bits[0:63] (BE dword 0) = Wasm lanes in the high
++  // half of the register (high-numbered lanes in LE memory order).
++  // For an N-bit lane at Wasm index L:
++  //   If L is in the high dword (L >= 8/laneWidthBytes):
++  //     use mfvsrd; lane is at GPR bit offset laneWidthBits*(L -
++  //     8/laneWidthBytes) from LSB
++  //   Else (L in low dword):
++  //     swap dwords, then mfvsrd; lane is at GPR bit offset laneWidthBits*L
++  //     from LSB
++
++  unsigned lanesPerDword = 8 / laneWidthBytes;
++
++  if (lane >= lanesPerDword) {
++    masm.as_mfvsrd(dest, src);
++    unsigned shift = laneWidthBits * (lane - lanesPerDword);
++    if (shift) {
++      masm.x_srdi(dest, dest, shift);
++    }
++  } else {
++    if (HasPOWER9()) {
++      masm.as_mfvsrld(dest, src);
++    } else {
++      // POWER8: swap dwords to get dw1 into scalar position.
++      // Avoid ScratchSimd128Scope — callers may already hold it.
++      // Use xxpermdi directly on ScratchSimd128Reg (v0/VSR32, non-allocatable).
++      masm.as_xxpermdi(ScratchSimd128Reg, src, src, 2);
++      masm.as_mfvsrd(dest, ScratchSimd128Reg);
++    }
++    unsigned shift = laneWidthBits * lane;
++    if (shift) {
++      masm.x_srdi(dest, dest, shift);
++    }
++  }
++}
++
++void MacroAssembler::unsignedExtractLaneInt8x16(uint32_t lane,
++                                                FloatRegister src,
++                                                Register dest) {
++  MOZ_ASSERT(lane < 16);
++  if (HasPOWER9()) {
++    // vextractub puts VRB.BE_byte[UIM] at VRT.BE_byte[7] with the rest
++    // zeroed; mfvsrd then reads BE bytes 0..7 → low byte of dest, high
++    // bytes already 0. No mask needed.
++    as_vextractub(ScratchSimd128Reg, src, 15 - lane);
++    as_mfvsrd(dest, ScratchSimd128Reg);
++    return;
++  }
++  ExtractLaneToGPR(*this, lane, src, dest, 1, 8);
++  as_rldicl(dest, dest, 0, 56);
++}
++
++void MacroAssembler::unsignedExtractLaneInt16x8(uint32_t lane,
++                                                FloatRegister src,
++                                                Register dest) {
++  MOZ_ASSERT(lane < 8);
++  if (HasPOWER9()) {
++    as_vextractuh(ScratchSimd128Reg, src, 14 - 2 * lane);
++    as_mfvsrd(dest, ScratchSimd128Reg);
++    return;
++  }
++  ExtractLaneToGPR(*this, lane, src, dest, 2, 16);
++  as_rldicl(dest, dest, 0, 48);
++}
++
++void MacroAssembler::extractLaneFloat32x4(uint32_t lane, FloatRegister src,
++                                          FloatRegister dest) {
++  MOZ_ASSERT(lane < 4);
++  // BE word index = 3 - lane. xxextractuw extracts a word by BE byte offset.
++  // BE byte offset of BE word W = W*4. So offset = (3-lane)*4.
++  // xxextractuw puts the extracted word into bits[32:63] of dest (the low
++  // word of the scalar doubleword), then xscvspdpn converts SP→DP.
++  // xxspltw replicates a word into all 4 positions. The scalar SP value
++  // is then at bits[0:31] where xscvspdpn expects it.
++  as_xxspltw(dest, src, 3 - lane);
++  as_xscvspdpn(dest, dest);
++}
++
++void MacroAssembler::extractLaneFloat64x2(uint32_t lane, FloatRegister src,
++                                          FloatRegister dest) {
++  MOZ_ASSERT(lane < 2);
++  if (lane == 0) {
++    // Lane 0 = LE low dword = BE dword 1. Need to swap to scalar position.
++    as_xxpermdi(dest, src, src, 2);
++  } else {
++    // Lane 1 = LE high dword = BE dword 0 = scalar position.
++    if (src != dest) {
++      as_xxlor(dest, src, src);
++    }
++  }
++}
++
++// ===============================================================
++// Replace lane
++
++void MacroAssembler::replaceLaneInt8x16(unsigned lane, Register rhs,
++                                        FloatRegister lhsDest) {
++  MOZ_ASSERT(lane < 16);
++  if (HasPOWER10()) {
++    // 2 insns + 1 GPR scratch: load lane index, vinsbrx (right-indexed
++    // = LE-natural). vinsbrx masks RA & 0xF, so the immediate fits.
++    UseScratchRegisterScope temps(asMasm());
++    Register idx = temps.Acquire();
++    xs_li(idx, int16_t(lane));
++    as_vinsbrx(lhsDest, idx, rhs);
++    return;
++  }
++  if (HasPOWER9()) {
++    // 2 insns + 1 VSR scratch: stage rhs in BE 0..63 of a scratch VSR
++    // (low byte of rhs lands at BE byte 7), then vinsertb copies that
++    // BE byte 7 into lhsDest's BE byte (15 - lane) = wasm lane L.
++    ScratchSimd128Scope scratch(*this);
++    as_mtvsrd(scratch, rhs);
++    as_vinsertb(lhsDest, scratch, 15 - lane);
++    return;
++  }
++  {
++    // POWER8: extract dword, use rldimi to insert byte, write back.
++    // Only needs 1 GPR scratch.
++    UseScratchRegisterScope temps(asMasm());
++    ScratchSimd128Scope scratch128(*this);
++    Register tmp = temps.Acquire();
++    unsigned dword = lane / 8;
++    unsigned byteInDword = lane % 8;
++    if (dword == 1) {
++      as_mfvsrd(tmp, lhsDest);
++    } else {
++      as_xxpermdi(scratch128, lhsDest, lhsDest, 2);
++      as_mfvsrd(tmp, scratch128);
++    }
++    // rldimi RT,RS,SH,MB: insert rotated RS bits into RT at positions
++    // MB..63-SH. Insert rhs byte at bit offset 8*byteInDword from LSB:
++    //   SH = 8*byteInDword, MB = 56 - 8*byteInDword
++    as_rldimi(tmp, rhs, 8 * byteInDword, 56 - 8 * byteInDword);
++    as_mtvsrd(scratch128, tmp);
++    // mtvsrd writes scratch128.dw0 from `tmp` and leaves scratch128.dw1
++    // undefined. Both xxpermdi forms below select scratch128.dw0 only:
++    //   DM=0b01 → [scratch.dw0, lhsDest.dw1]
++    //   DM=0b00 → [lhsDest.dw0, scratch.dw0]
++    // So the undefined dw1 is never read. INVARIANT: any future change
++    // to either DM literal MUST first zero scratch128.dw1 via xxlxor or
++    // adopt a different staging scheme; otherwise reads of dw1 produce
++    // POWER9-zero / POWER8-undefined garbage in the output.
++    if (dword == 1) {
++      as_xxpermdi(lhsDest, scratch128, lhsDest, 1);
++    } else {
++      as_xxpermdi(lhsDest, lhsDest, scratch128, 0);
++    }
++  }
++}
++
++void MacroAssembler::replaceLaneInt16x8(unsigned lane, Register rhs,
++                                        FloatRegister lhsDest) {
++  MOZ_ASSERT(lane < 8);
++  if (HasPOWER10()) {
++    // 2 insns + 1 GPR scratch: lane*2 → byte position, then vinshrx.
++    UseScratchRegisterScope temps(asMasm());
++    Register idx = temps.Acquire();
++    xs_li(idx, int16_t(lane * 2));
++    as_vinshrx(lhsDest, idx, rhs);
++    return;
++  }
++  if (HasPOWER9()) {
++    // 2 insns + 1 VSR scratch: stage rhs in BE 0..63 (low 16 of rhs
++    // lands at BE bytes 6..7), then vinserth copies those two bytes
++    // into lhsDest's BE bytes (14 - 2L)..(15 - 2L) = wasm lane L.
++    ScratchSimd128Scope scratch(*this);
++    as_mtvsrd(scratch, rhs);
++    as_vinserth(lhsDest, scratch, 14 - 2 * lane);
++    return;
++  }
++  {
++    // POWER8: extract dword, rldimi to insert halfword, write back.
++    // Same dw1-undef invariant as replaceLaneInt8x16 above.
++    UseScratchRegisterScope temps(asMasm());
++    ScratchSimd128Scope scratch128(*this);
++    Register tmp = temps.Acquire();
++    unsigned dword = lane / 4;
++    unsigned hwInDword = lane % 4;
++    if (dword == 1) {
++      as_mfvsrd(tmp, lhsDest);
++    } else {
++      as_xxpermdi(scratch128, lhsDest, lhsDest, 2);
++      as_mfvsrd(tmp, scratch128);
++    }
++    as_rldimi(tmp, rhs, 16 * hwInDword, 48 - 16 * hwInDword);
++    as_mtvsrd(scratch128, tmp);
++    if (dword == 1) {
++      as_xxpermdi(lhsDest, scratch128, lhsDest, 1);
++    } else {
++      as_xxpermdi(lhsDest, lhsDest, scratch128, 0);
++    }
++  }
++}
++
++void MacroAssembler::replaceLaneInt32x4(unsigned lane, Register rhs,
++                                        FloatRegister lhsDest) {
++  MOZ_ASSERT(lane < 4);
++  if (HasPOWER10()) {
++    // 1 insn, no scratch VSR. UIM is the BE byte offset.
++    as_vinsw(lhsDest, rhs, (3 - lane) * 4);
++    return;
++  }
++  if (HasPOWER9()) {
++    // POWER9: xxinsertw inserts word from bits[32:63] of XB at BE byte
++    // offset UIM in XT. mtvsrd puts GPR into bits[0:63]; low 32 bits
++    // land at bits[32:63]. BE byte offset of Wasm word lane = (3-lane)*4.
++    ScratchSimd128Scope scratch(*this);
++    as_mtvsrd(scratch, rhs);
++    as_xxinsertw(lhsDest, scratch, (3 - lane) * 4);
++    return;
++  }
++  // POWER8: extract dword, rldimi to insert word, write back.
++  // Modeled on replaceLaneInt16x8 below.
++  UseScratchRegisterScope temps(asMasm());
++  ScratchSimd128Scope scratch128(*this);
++  Register tmp = temps.Acquire();
++  unsigned dword = lane / 2;        // 0 = lanes 0,1; 1 = lanes 2,3.
++  unsigned wordInDword = lane % 2;  // 0 = low LE word; 1 = high LE word.
++  if (dword == 1) {
++    as_mfvsrd(tmp, lhsDest);
++  } else {
++    as_xxpermdi(scratch128, lhsDest, lhsDest, 2);
++    as_mfvsrd(tmp, scratch128);
++  }
++  as_rldimi(tmp, rhs, 32 * wordInDword, 32 - 32 * wordInDword);
++  as_mtvsrd(scratch128, tmp);
++  if (dword == 1) {
++    as_xxpermdi(lhsDest, scratch128, lhsDest, 1);
++  } else {
++    as_xxpermdi(lhsDest, lhsDest, scratch128, 0);
++  }
++}
++
++void MacroAssembler::replaceLaneFloat32x4(unsigned lane, FloatRegister rhs,
++                                          FloatRegister lhsDest) {
++  MOZ_ASSERT(lane < 4);
++  if (HasPOWER9()) {
++    ScratchSimd128Scope scratch(*this);
++    as_xscvdpspn(scratch, rhs);
++    as_xxinsertw(lhsDest, scratch, (3 - lane) * 4);
++    return;
++  }
++  // POWER8: convert double rhs to single (lands in BE bits 0..31 of FPR),
++  // extract bits to a GPR, then route through the integer insert path.
++  UseScratchRegisterScope temps(asMasm());
++  Register rhsBits = temps.Acquire();
++  {
++    ScratchSimd128Scope scratch(*this);
++    as_xscvdpspn(scratch, rhs);
++    as_mfvsrd(rhsBits, scratch);   // single is in high 32 bits of GPR
++    x_srdi(rhsBits, rhsBits, 32);  // single → low 32 bits
++  }
++  // Inline the int-insert sequence (can't call replaceLaneInt32x4 from
++  // here because we're already inside a UseScratchRegisterScope and
++  // need to acquire a separate tmp).
++  ScratchSimd128Scope scratch128(*this);
++  Register tmp = temps.Acquire();
++  unsigned dword = lane / 2;
++  unsigned wordInDword = lane % 2;
++  if (dword == 1) {
++    as_mfvsrd(tmp, lhsDest);
++  } else {
++    as_xxpermdi(scratch128, lhsDest, lhsDest, 2);
++    as_mfvsrd(tmp, scratch128);
++  }
++  as_rldimi(tmp, rhsBits, 32 * wordInDword, 32 - 32 * wordInDword);
++  as_mtvsrd(scratch128, tmp);
++  if (dword == 1) {
++    as_xxpermdi(lhsDest, scratch128, lhsDest, 1);
++  } else {
++    as_xxpermdi(lhsDest, lhsDest, scratch128, 0);
++  }
++}
++
++void MacroAssembler::replaceLaneFloat64x2(unsigned lane, FloatRegister rhs,
++                                          FloatRegister lhsDest) {
++  MOZ_ASSERT(lane < 2);
++  // xxpermdi to place the scalar double into the correct lane.
++  if (lane == 0) {
++    // Replace LE low dword (= dw1). Keep lhsDest dw0 (lane 1).
++    // rhs scalar is in dw0. dm=0b00: [lhsDest.dw0, rhs.dw0]
++    as_xxpermdi(lhsDest, lhsDest, rhs, 0);
++  } else {
++    // Replace LE high dword (= dw0). Keep lhsDest dw1 (lane 0).
++    // rhs scalar is in dw0. dm=0b01: [rhs.dw0, lhsDest.dw1]
++    as_xxpermdi(lhsDest, rhs, lhsDest, 1);
++  }
++}
++
++void MacroAssembler::shuffleInt8x16(const uint8_t lanes[16], FloatRegister rhs,
++                                    FloatRegister lhsDest) {
++  shuffleInt8x16(lanes, lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::shuffleInt8x16(const uint8_t lanes[16], FloatRegister lhs,
++                                    FloatRegister rhs, FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  // PPC64 vperm uses BE byte indices: VRA[0]=MSB, VRA[15]=LSB, VRB[16..31].
++  // Convert Wasm LE lane indices to vperm control: lhs lane N = BE index
++  // (15-N), rhs lane N = BE index (31-N) = (47 - (N+16)).
++  int8_t ctrl[16];
++  for (unsigned i = 0; i < 16; i++) {
++    uint8_t src = lanes[i];
++    if (src < 16) {
++      ctrl[i] = 15 - src;
++    } else {
++      ctrl[i] = 47 - src;
++    }
++  }
++  loadConstantSimd128(SimdConstant::CreateX16(ctrl), scratch);
++  // vperm directly on Simd128 regs.
++  as_vperm(dest.encoding() & 31, lhs.encoding() & 31, rhs.encoding() & 31,
++           scratch.encoding() & 31);
++}
++
++void MacroAssembler::laneSelectSimd128(FloatRegister mask, FloatRegister lhs,
++                                       FloatRegister rhs, FloatRegister dest) {
++  // xxsel: XC=0→XA, XC=1→XB → XT = (XA & ~XC) | (XB & XC)
++  // laneSelect: dest = (lhs & mask) | (rhs & ~mask)
++  // Need XA=rhs, XB=lhs, XC=mask.
++  as_xxsel(dest, rhs, lhs, mask);
++}
++
++void MacroAssembler::interleaveHighInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                           FloatRegister dest) {
++  // On LE, vmrghb(rhs, lhs) gives Wasm interleave_high.
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrghb), rhs, lhs, dest);
++}
++
++void MacroAssembler::interleaveHighInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                           FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrghh), rhs, lhs, dest);
++}
++
++void MacroAssembler::interleaveHighInt32x4(FloatRegister lhs, FloatRegister rhs,
++                                           FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrghw), rhs, lhs, dest);
++}
++
++void MacroAssembler::interleaveHighInt64x2(FloatRegister lhs, FloatRegister rhs,
++                                           FloatRegister dest) {
++  // xxpermdi DM=0: [XA.dw0, XB.dw0] = merge high dwords.
++  // On LE: dw0 = high Wasm lane (lane 1).
++  as_xxpermdi(dest, rhs, lhs, 0);
++}
++
++void MacroAssembler::interleaveLowInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                          FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrglb), rhs, lhs, dest);
++}
++
++void MacroAssembler::interleaveLowInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                          FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrglh), rhs, lhs, dest);
++}
++
++void MacroAssembler::interleaveLowInt32x4(FloatRegister lhs, FloatRegister rhs,
++                                          FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrglw), rhs, lhs, dest);
++}
++
++void MacroAssembler::interleaveLowInt64x2(FloatRegister lhs, FloatRegister rhs,
++                                          FloatRegister dest) {
++  // xxpermdi DM=3: [XA.dw1, XB.dw1] = merge low dwords.
++  as_xxpermdi(dest, rhs, lhs, 3);
++}
++
++void MacroAssembler::concatAndRightShiftSimd128(FloatRegister lhs,
++                                                FloatRegister rhs,
++                                                FloatRegister dest,
++                                                uint32_t shift) {
++  // vsldoi(VRA, VRB, SH) extracts 16 bytes starting at byte SH of the
++  // big-endian concatenation VRA||VRB. Endianness mapping for the Wasm
++  // `v128.shuffle` right-shift-concat semantic:
++  //   Wasm:  result[i] = (i + shift < 16) ? rhs[i + shift]
++  //                                       : lhs[i + shift - 16]
++  //   PPC LE: vsldoi(rhs, lhs, shift) produces exactly that — the LE byte
++  //   layout reverses from BE, so passing (rhs, lhs, shift) here is the LE
++  //   equivalent of (lhs, rhs, 16 - shift) on BE.
++  MOZ_ASSERT(shift < 16);
++  if (shift == 0) {
++    moveSimd128(rhs, dest);
++    return;
++  }
++  // vsldoi VRT,VRA,VRB,SH: result[i] = (VRA||VRB)[SH+i]
++  // Emit vsldoi directly on Simd128 regs (VRA = lhs = high part, VRB =
++  // rhs = low part). The VMX emitter masks `& 31` internally to extract
++  // the 5-bit VR field from the Simd128 encoding.
++  as_vsldoi(dest, lhs, rhs, shift);
++}
++
++void MacroAssembler::leftShiftSimd128(Imm32 count, FloatRegister src,
++                                      FloatRegister dest) {
++  MOZ_ASSERT(count.value < 16);
++  if (count.value == 0) {
++    moveSimd128(src, dest);
++    return;
++  }
++  // vslo shifts left by bytes (count in bits 121-124 of VRB, i.e. byte 15 bits
++  // 1-4). vsl shifts left by bits (count in bits 125-127 of VRB, i.e. byte 15
++  // bits 5-7). For byte shift: splatX4(count*8, scratch), then vslo.
++  ScratchSimd128Scope scratch(*this);
++  SplatImm32(*this, Imm32(count.value * 8), scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslo), src, scratch, dest);
++}
++
++void MacroAssembler::rightShiftSimd128(Imm32 count, FloatRegister src,
++                                       FloatRegister dest) {
++  MOZ_ASSERT(count.value < 16);
++  if (count.value == 0) {
++    moveSimd128(src, dest);
++    return;
++  }
++  ScratchSimd128Scope scratch(*this);
++  SplatImm32(*this, Imm32(count.value * 8), scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsro), src, scratch, dest);
++}
++
++void MacroAssembler::zeroExtend8x16To16x8(FloatRegister src,
++                                          FloatRegister dest) {
++  // Unsigned widen low: interleave low bytes with zero bytes.
++  // On LE, vmrglb(zero, src) interleaves the low 8 bytes of src with zeros.
++  // Use ScratchSimd128Reg as the zero. Order matters: read src into the
++  // merge BEFORE writing dest (which might alias src). vmrglb reads
++  // vra+vrb, writes vrt — single-cycle issue.
++  ScratchSimd128Scope zero(*this);
++  as_xxlxor(zero, zero, zero);
++  as_vmrglb(dest.encoding() & 31, zero.encoding() & 31, src.encoding() & 31);
++}
++
++void MacroAssembler::zeroExtend8x16To32x4(FloatRegister src,
++                                          FloatRegister dest) {
++  zeroExtend8x16To16x8(src, dest);
++  zeroExtend16x8To32x4(dest, dest);
++}
++
++void MacroAssembler::zeroExtend8x16To64x2(FloatRegister src,
++                                          FloatRegister dest) {
++  zeroExtend8x16To32x4(src, dest);
++  zeroExtend32x4To64x2(dest, dest);
++}
++
++void MacroAssembler::zeroExtend16x8To32x4(FloatRegister src,
++                                          FloatRegister dest) {
++  // Unsigned widen low: interleave low halfwords with zero halfwords.
++  ScratchSimd128Scope zero(*this);
++  as_xxlxor(zero, zero, zero);
++  as_vmrglh(dest.encoding() & 31, zero.encoding() & 31, src.encoding() & 31);
++}
++
++void MacroAssembler::zeroExtend16x8To64x2(FloatRegister src,
++                                          FloatRegister dest) {
++  zeroExtend16x8To32x4(src, dest);
++  zeroExtend32x4To64x2(dest, dest);
++}
++
++void MacroAssembler::zeroExtend32x4To64x2(FloatRegister src,
++                                          FloatRegister dest) {
++  // Unsigned widen low: interleave low words with zero words.
++  ScratchSimd128Scope zero(*this);
++  as_xxlxor(zero, zero, zero);
++  as_vmrglw(dest.encoding() & 31, zero.encoding() & 31, src.encoding() & 31);
++}
++
++void MacroAssembler::reverseInt16x8(FloatRegister src, FloatRegister dest) {
++  const uint8_t lanes[] = {14, 15, 12, 13, 10, 11, 8, 9,
++                           6,  7,  4,  5,  2,  3,  0, 1};
++  shuffleInt8x16(lanes, src, src, dest);
++}
++
++void MacroAssembler::reverseInt32x4(FloatRegister src, FloatRegister dest) {
++  const uint8_t lanes[] = {12, 13, 14, 15, 8, 9, 10, 11,
++                           4,  5,  6,  7,  0, 1, 2,  3};
++  shuffleInt8x16(lanes, src, src, dest);
++}
++
++void MacroAssembler::reverseInt64x2(FloatRegister src, FloatRegister dest) {
++  as_xxpermdi(dest, src, src, 2);
++}
++
++void MacroAssembler::swizzleInt8x16Relaxed(FloatRegister lhs, FloatRegister rhs,
++                                           FloatRegister dest) {
++  swizzleInt8x16(lhs, rhs, dest);
++}
++
++// extMul{Low,High}Int{8x16,16x8} use POWER8+ widening multiplies
++// (vmul{e,o}{s,u}{b,h}) plus a halfword/word merge to map BE-indexed
++// even/odd products into Wasm lane order on PPC64 LE.
++//
++// Lane mapping:
++//   For Low (Wasm lanes from LE bytes/HW 0..N/2-1 = BE 15..N/2):
++//     vmrgl{h,w}(even_products, odd_products) places the right products
++//     at BE result indices, which on LE map to Wasm lanes 0..N/2-1.
++//   For High (Wasm lanes from LE indices N/2..N-1 = BE N/2-1..0):
++//     vmrgh{h,w} takes the upper-half BE indices instead.
++//
++// Aliasing safety: vmul* reads both operands before writing, so
++// `dest = vmulo* lhs, rhs` is safe even when dest aliases lhs/rhs.
++// We use one scratch for the even-product half because vmrgl{h,w}
++// reads dest after the odd multiply.
++
++void MacroAssembler::extMulLowInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                      FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++  uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++  as_vmulesb(s, l, r);
++  as_vmulosb(d, l, r);
++  as_vmrglh(d, s, d);
++}
++
++void MacroAssembler::extMulHighInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                       FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++  uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++  as_vmulesb(s, l, r);
++  as_vmulosb(d, l, r);
++  as_vmrghh(d, s, d);
++}
++
++void MacroAssembler::unsignedExtMulLowInt8x16(FloatRegister lhs,
++                                              FloatRegister rhs,
++                                              FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++  uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++  as_vmuleub(s, l, r);
++  as_vmuloub(d, l, r);
++  as_vmrglh(d, s, d);
++}
++
++void MacroAssembler::unsignedExtMulHighInt8x16(FloatRegister lhs,
++                                               FloatRegister rhs,
++                                               FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++  uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++  as_vmuleub(s, l, r);
++  as_vmuloub(d, l, r);
++  as_vmrghh(d, s, d);
++}
++
++void MacroAssembler::extMulLowInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                      FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++  uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++  as_vmulesh(s, l, r);
++  as_vmulosh(d, l, r);
++  as_vmrglw(d, s, d);
++}
++
++void MacroAssembler::extMulHighInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                       FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++  uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++  as_vmulesh(s, l, r);
++  as_vmulosh(d, l, r);
++  as_vmrghw(d, s, d);
++}
++
++void MacroAssembler::unsignedExtMulLowInt16x8(FloatRegister lhs,
++                                              FloatRegister rhs,
++                                              FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++  uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++  as_vmuleuh(s, l, r);
++  as_vmulouh(d, l, r);
++  as_vmrglw(d, s, d);
++}
++
++void MacroAssembler::unsignedExtMulHighInt16x8(FloatRegister lhs,
++                                               FloatRegister rhs,
++                                               FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++  uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++  as_vmuleuh(s, l, r);
++  as_vmulouh(d, l, r);
++  as_vmrghw(d, s, d);
++}
++
++// ExtMul{Low,High}Int32x4 use vmul{e,o}{s,u}w (POWER8+) plus xxpermdi
++// to combine the two i64 partial products into Wasm lane order on PPC64
++// LE. xxpermdi accepts the full 6-bit VSR encoding so it works directly
++// on Simd128 regs (encoding 32-63) without any VR staging.
++//
++// Aliasing safe: both vmul* reads complete before the second one writes
++// dest, and xxpermdi reads both inputs before writing.
++
++static void EmitExtMulInt32x4(
++    MacroAssembler& masm, FloatRegister lhs, FloatRegister rhs,
++    FloatRegister dest, void (*mulEven)(Assembler&, uint8_t, uint8_t, uint8_t),
++    void (*mulOdd)(Assembler&, uint8_t, uint8_t, uint8_t), uint8_t dm) {
++  ScratchSimd128Scope scratch(masm);
++  uint8_t l = lhs.encoding() & 31;
++  uint8_t r = rhs.encoding() & 31;
++  uint8_t d = dest.encoding() & 31;
++  uint8_t s = scratch.encoding() & 31;
++  mulEven(static_cast<Assembler&>(masm), s, l, r);
++  mulOdd(static_cast<Assembler&>(masm), d, l, r);
++  masm.as_xxpermdi(dest, scratch, dest, dm);
++}
++
++void MacroAssembler::extMulLowInt32x4(FloatRegister lhs, FloatRegister rhs,
++                                      FloatRegister dest) {
++  EmitExtMulInt32x4(
++      *this, lhs, rhs, dest,
++      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++        a.as_vmulesw(t, x, y);
++      },
++      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++        a.as_vmulosw(t, x, y);
++      },
++      3);
++}
++
++void MacroAssembler::extMulHighInt32x4(FloatRegister lhs, FloatRegister rhs,
++                                       FloatRegister dest) {
++  EmitExtMulInt32x4(
++      *this, lhs, rhs, dest,
++      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++        a.as_vmulesw(t, x, y);
++      },
++      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++        a.as_vmulosw(t, x, y);
++      },
++      0);
++}
++
++void MacroAssembler::unsignedExtMulLowInt32x4(FloatRegister lhs,
++                                              FloatRegister rhs,
++                                              FloatRegister dest) {
++  EmitExtMulInt32x4(
++      *this, lhs, rhs, dest,
++      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++        a.as_vmuleuw(t, x, y);
++      },
++      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++        a.as_vmulouw(t, x, y);
++      },
++      3);
++}
++
++void MacroAssembler::unsignedExtMulHighInt32x4(FloatRegister lhs,
++                                               FloatRegister rhs,
++                                               FloatRegister dest) {
++  EmitExtMulInt32x4(
++      *this, lhs, rhs, dest,
++      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++        a.as_vmuleuw(t, x, y);
++      },
++      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++        a.as_vmulouw(t, x, y);
++      },
++      0);
++}
++
++void MacroAssembler::q15MulrSatInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                       FloatRegister dest) {
++  // Q15 multiply-round-saturate: vmhraddshs(a, b, zero) computes
++  // saturate((a[i]*b[i] + 0x4000) >> 15) for each halfword.
++  ScratchSimd128Scope scratch(*this);
++  ZeroSimd128(*this, scratch);
++  EmitVmxTernary(
++      *this,
++      [](Assembler& a, uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc) {
++        a.as_vmhraddshs(vrt, vra, vrb, vrc);
++      },
++      lhs, rhs, scratch, dest);
++}
++
++// neg = 0 - src. Use ScratchSimd128Reg (= VR0, non-allocatable) as the
++// zero source so the register allocator sees no clobbered VRs.
++// 2 insns: xxlxor scratch + vsubuXm dest, scratch, src. vneg{b,h}
++// doesn't exist in any POWER ISA, hence the subtract.
++void MacroAssembler::negInt8x16(FloatRegister src, FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  ZeroSimd128(*this, scratch);
++  as_vsububm(dest.encoding() & 31, scratch.encoding() & 31,
++             src.encoding() & 31);
++}
++
++void MacroAssembler::negInt16x8(FloatRegister src, FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  ZeroSimd128(*this, scratch);
++  as_vsubuhm(dest.encoding() & 31, scratch.encoding() & 31,
++             src.encoding() & 31);
++}
++
++void MacroAssembler::negInt32x4(FloatRegister src, FloatRegister dest) {
++  if (HasPOWER9()) {
++    EmitVmxUnary(
++        *this,
++        [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vnegw(vrt, vrb); },
++        src, dest);
++    return;
++  }
++  // POWER8 fallback: 0 - src via ScratchSimd128Reg (VR0).
++  ScratchSimd128Scope scratch(*this);
++  ZeroSimd128(*this, scratch);
++  as_vsubuwm(dest.encoding() & 31, scratch.encoding() & 31,
++             src.encoding() & 31);
++}
++
++void MacroAssembler::negInt64x2(FloatRegister src, FloatRegister dest) {
++  if (HasPOWER9()) {
++    EmitVmxUnary(
++        *this,
++        [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vnegd(vrt, vrb); },
++        src, dest);
++    return;
++  }
++  // POWER8 fallback: 0 - src via ScratchSimd128Reg (VR0).
++  ScratchSimd128Scope scratch(*this);
++  ZeroSimd128(*this, scratch);
++  as_vsubudm(dest.encoding() & 31, scratch.encoding() & 31,
++             src.encoding() & 31);
++}
++#undef DEF_NEG_INTNxM_VSUB
++
++void MacroAssembler::unsignedAddSatInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                           FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddubs), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedAddSatInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                           FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vadduhs), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedSubSatInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                           FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsububs), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedSubSatInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                           FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubuhs), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedMinInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                        FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminub), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedMinInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                        FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminuh), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedMinInt32x4(FloatRegister lhs, FloatRegister rhs,
++                                        FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminuw), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedMaxInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                        FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxub), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedMaxInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                        FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxuh), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedMaxInt32x4(FloatRegister lhs, FloatRegister rhs,
++                                        FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxuw), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedAverageInt8x16(FloatRegister lhs,
++                                            FloatRegister rhs,
++                                            FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vavgub), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedAverageInt16x8(FloatRegister lhs,
++                                            FloatRegister rhs,
++                                            FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vavguh), lhs, rhs, dest);
++}
++
++// abs(x) = max(x, -x) per signed lane. No vabs{b,h,w,d} exists in any ISA.
++// vneg{w,d} exists only on POWER9.
++// We use ScratchSimd128Reg as a temp for -src. Order matters: compute
++// -src into temp first (reads src), then max(src, temp) into dest (reads
++// src + temp, writes dest). Safe even when dest == src because src is
++// read before dest is written by vmaxsX.
++
++void MacroAssembler::absInt8x16(FloatRegister src, FloatRegister dest) {
++  ScratchSimd128Scope tmp(*this);
++  as_xxlxor(tmp, tmp, tmp);  // tmp = 0
++  as_vsububm(tmp.encoding() & 31, tmp.encoding() & 31,
++             src.encoding() & 31);  // tmp = -src
++  as_vmaxsb(dest.encoding() & 31, src.encoding() & 31,
++            tmp.encoding() & 31);  // dest = max(src, -src)
++}
++
++void MacroAssembler::absInt16x8(FloatRegister src, FloatRegister dest) {
++  ScratchSimd128Scope tmp(*this);
++  as_xxlxor(tmp, tmp, tmp);
++  as_vsubuhm(tmp.encoding() & 31, tmp.encoding() & 31, src.encoding() & 31);
++  as_vmaxsh(dest.encoding() & 31, src.encoding() & 31, tmp.encoding() & 31);
++}
++
++void MacroAssembler::absInt32x4(FloatRegister src, FloatRegister dest) {
++  ScratchSimd128Scope tmp(*this);
++  if (HasPOWER9()) {
++    as_vnegw(tmp.encoding() & 31, src.encoding() & 31);  // tmp = -src
++  } else {
++    as_xxlxor(tmp, tmp, tmp);
++    as_vsubuwm(tmp.encoding() & 31, tmp.encoding() & 31, src.encoding() & 31);
++  }
++  as_vmaxsw(dest.encoding() & 31, src.encoding() & 31, tmp.encoding() & 31);
++}
++
++void MacroAssembler::absInt64x2(FloatRegister src, FloatRegister dest) {
++  ScratchSimd128Scope tmp(*this);
++  if (HasPOWER9()) {
++    as_vnegd(tmp.encoding() & 31, src.encoding() & 31);  // tmp = -src
++  } else {
++    as_xxlxor(tmp, tmp, tmp);
++    as_vsubudm(tmp.encoding() & 31, tmp.encoding() & 31, src.encoding() & 31);
++  }
++  as_vmaxsd(dest.encoding() & 31, src.encoding() & 31, tmp.encoding() & 31);
++}
++
++void MacroAssembler::leftShiftInt8x16(Imm32 count, FloatRegister src,
++                                      FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  SplatImm8(*this, count, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslb), src, scratch, dest);
++}
++
++void MacroAssembler::leftShiftInt16x8(Imm32 count, FloatRegister src,
++                                      FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  SplatImm16(*this, count, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslh), src, scratch, dest);
++}
++
++void MacroAssembler::leftShiftInt32x4(Imm32 count, FloatRegister src,
++                                      FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  SplatImm32(*this, count, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslw), src, scratch, dest);
++}
++
++void MacroAssembler::leftShiftInt64x2(Imm32 count, FloatRegister src,
++                                      FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  SplatImm32(*this, count, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsld), src, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt8x16(Imm32 count, FloatRegister src,
++                                       FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  SplatImm8(*this, count, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrab), src, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt8x16(Imm32 count, FloatRegister src,
++                                               FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  SplatImm8(*this, count, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrb), src, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt16x8(Imm32 count, FloatRegister src,
++                                       FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  SplatImm16(*this, count, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrah), src, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt16x8(Imm32 count, FloatRegister src,
++                                               FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  SplatImm16(*this, count, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrh), src, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt32x4(Imm32 count, FloatRegister src,
++                                       FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  SplatImm32(*this, count, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsraw), src, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt32x4(Imm32 count, FloatRegister src,
++                                               FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  SplatImm32(*this, count, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrw), src, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt64x2(Imm32 count, FloatRegister src,
++                                       FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  SplatImm32(*this, count, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrad), src, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt64x2(Imm32 count, FloatRegister src,
++                                               FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  SplatImm32(*this, count, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrd), src, scratch, dest);
++}
++
++void MacroAssembler::bitwiseAndSimd128(FloatRegister rhs,
++                                       FloatRegister lhsDest) {
++  as_xxland(lhsDest, lhsDest, rhs);
++}
++
++void MacroAssembler::bitwiseAndSimd128(FloatRegister lhs, FloatRegister rhs,
++                                       FloatRegister dest) {
++  as_xxland(dest, lhs, rhs);
++}
++
++void MacroAssembler::bitwiseOrSimd128(FloatRegister rhs,
++                                      FloatRegister lhsDest) {
++  as_xxlor(lhsDest, lhsDest, rhs);
++}
++
++void MacroAssembler::bitwiseOrSimd128(FloatRegister lhs, FloatRegister rhs,
++                                      FloatRegister dest) {
++  as_xxlor(dest, lhs, rhs);
++}
++
++void MacroAssembler::bitwiseXorSimd128(FloatRegister rhs,
++                                       FloatRegister lhsDest) {
++  as_xxlxor(lhsDest, lhsDest, rhs);
++}
++
++void MacroAssembler::bitwiseXorSimd128(FloatRegister lhs, FloatRegister rhs,
++                                       FloatRegister dest) {
++  as_xxlxor(dest, lhs, rhs);
++}
++
++void MacroAssembler::bitwiseNotSimd128(FloatRegister src, FloatRegister dest) {
++  as_xxlnor(dest, src, src);
++}
++
++void MacroAssembler::bitwiseNotAndSimd128(FloatRegister rhs,
++                                          FloatRegister lhsDest) {
++  // notand(lhs, rhs) = ~lhs & rhs = xxlandc(rhs, lhs)
++  as_xxlandc(lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::anyTrueSimd128(FloatRegister src, Register dest) {
++  // vcmpequd. (POWER8+) against zero sets CR6:
++  //   - CR6.LT (BE bit 24) = 1 iff the per-lane result is all-1s, i.e.
++  //     every doubleword of src equals zero (= src is all-zero).
++  //   - CR6.EQ (BE bit 26) = 1 iff no lane was equal (= any nonzero).
++  // any-true = !all-zero = !CR6.LT.
++  ScratchSimd128Scope scratch(*this);
++  uint8_t s = scratch.encoding() & 31;
++  as_xxlxor(scratch, scratch, scratch);
++  as_vcmpequd_rc(s, src.encoding() & 31, s);
++  if (HasPOWER10()) {
++    // setbcr materialises (CR[BI] == 0) ? 1 : 0 directly into dest.
++    // dest = (CR6.LT == 0) = "not all-zero" = any-true.
++    as_setbcr(dest, Assembler::LessThan, cr6);
++    return;
++  }
++  as_mfocrf(dest, cr6);
++  // CR6.LT is at BE bit 24 of the GPR. rlwinm sh=25 rotates left 25:
++  // bit (24 - 25) mod 32 = 31 (LSB). Mask 31..31 keeps just bit 31.
++  as_rlwinm(dest, dest, 25, 31, 31);
++  as_xori(dest, dest, 1);
++}
++
++// vcmpequX. against zero sets CR6: LT = all input lanes were zero,
++// EQ = no input lane was zero. The latter is exactly "all-true".
++// mfocrf places CR6 at bits 24-27 of the low 32-bit half (LT=24, EQ=26).
++// rlwinm rd,rd,27,31,31 extracts bit 26 (CR6.EQ) right-justified.
++template <typename VmxCmpRcFn>
++static void EmitAllTrueInt(MacroAssembler& masm, FloatRegister src,
++                           Register dest, VmxCmpRcFn vmxCmpRc) {
++  ScratchSimd128Scope scratch(masm);
++  ZeroSimd128(masm, scratch);
++  uint8_t s = scratch.encoding() & 31;
++  vmxCmpRc(static_cast<Assembler&>(masm), s, src.encoding() & 31, s);
++  if (HasPOWER10()) {
++    // setbc materialises CR6.EQ directly into dest (1 insn vs the 2-insn
++    // mfocrf + rlwinm extract). Already wired in ma_cmp_set.
++    masm.as_setbc(dest, Assembler::Equal, cr6);
++    return;
++  }
++  masm.as_mfocrf(dest, cr6);
++  masm.as_rlwinm(dest, dest, 27, 31, 31);
++}
++
++void MacroAssembler::allTrueInt8x16(FloatRegister src, Register dest) {
++  EmitAllTrueInt(*this, src, dest,
++                 [](Assembler& a, uint8_t t, uint8_t r, uint8_t b) {
++                   a.as_vcmpequb_rc(t, r, b);
++                 });
++}
++
++void MacroAssembler::allTrueInt16x8(FloatRegister src, Register dest) {
++  EmitAllTrueInt(*this, src, dest,
++                 [](Assembler& a, uint8_t t, uint8_t r, uint8_t b) {
++                   a.as_vcmpequh_rc(t, r, b);
++                 });
++}
++
++void MacroAssembler::allTrueInt32x4(FloatRegister src, Register dest) {
++  EmitAllTrueInt(*this, src, dest,
++                 [](Assembler& a, uint8_t t, uint8_t r, uint8_t b) {
++                   a.as_vcmpequw_rc(t, r, b);
++                 });
++}
++
++void MacroAssembler::allTrueInt64x2(FloatRegister src, Register dest) {
++  EmitAllTrueInt(*this, src, dest,
++                 [](Assembler& a, uint8_t t, uint8_t r, uint8_t b) {
++                   a.as_vcmpequd_rc(t, r, b);
++                 });
++}
++
++void MacroAssembler::compareInt8x16(Assembler::Condition cond,
++                                    FloatRegister rhs, FloatRegister lhsDest) {
++  compareInt8x16(cond, lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::compareInt8x16(Assembler::Condition cond,
++                                    FloatRegister lhs, FloatRegister rhs,
++                                    FloatRegister dest) {
++  if (cond == Assembler::NotEqual && HasPOWER9()) {
++    EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vcmpneb), lhs, rhs, dest);
++    return;
++  }
++  EmitVmxCompare(*this, cond, lhs, rhs, dest, VMX_BINARY_WRAPPER(vcmpequb),
++                 VMX_BINARY_WRAPPER(vcmpgtsb), VMX_BINARY_WRAPPER(vcmpgtub));
++}
++
++void MacroAssembler::compareInt16x8(Assembler::Condition cond,
++                                    FloatRegister rhs, FloatRegister lhsDest) {
++  compareInt16x8(cond, lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::compareInt16x8(Assembler::Condition cond,
++                                    FloatRegister lhs, FloatRegister rhs,
++                                    FloatRegister dest) {
++  if (cond == Assembler::NotEqual && HasPOWER9()) {
++    EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vcmpneh), lhs, rhs, dest);
++    return;
++  }
++  EmitVmxCompare(*this, cond, lhs, rhs, dest, VMX_BINARY_WRAPPER(vcmpequh),
++                 VMX_BINARY_WRAPPER(vcmpgtsh), VMX_BINARY_WRAPPER(vcmpgtuh));
++}
++
++void MacroAssembler::compareInt32x4(Assembler::Condition cond,
++                                    FloatRegister rhs, FloatRegister lhsDest) {
++  compareInt32x4(cond, lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::compareInt32x4(Assembler::Condition cond,
++                                    FloatRegister lhs, FloatRegister rhs,
++                                    FloatRegister dest) {
++  if (cond == Assembler::NotEqual && HasPOWER9()) {
++    EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vcmpnew), lhs, rhs, dest);
++    return;
++  }
++  EmitVmxCompare(*this, cond, lhs, rhs, dest, VMX_BINARY_WRAPPER(vcmpequw),
++                 VMX_BINARY_WRAPPER(vcmpgtsw), VMX_BINARY_WRAPPER(vcmpgtuw));
++}
++
++void MacroAssembler::compareFloat32x4(Assembler::Condition cond,
++                                      FloatRegister rhs,
++                                      FloatRegister lhsDest) {
++  compareFloat32x4(cond, lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::compareFloat32x4(Assembler::Condition cond,
++                                      FloatRegister lhs, FloatRegister rhs,
++                                      FloatRegister dest) {
++  switch (cond) {
++    case Assembler::Equal:
++      as_xvcmpeqsp(dest, lhs, rhs);
++      break;
++    case Assembler::NotEqual:
++      as_xvcmpeqsp(dest, lhs, rhs);
++      bitwiseNotSimd128(dest, dest);
++      break;
++    case Assembler::GreaterThan:
++      as_xvcmpgtsp(dest, lhs, rhs);
++      break;
++    case Assembler::GreaterThanOrEqual:
++      as_xvcmpgesp(dest, lhs, rhs);
++      break;
++    case Assembler::LessThan:
++      as_xvcmpgtsp(dest, rhs, lhs);
++      break;
++    case Assembler::LessThanOrEqual:
++      as_xvcmpgesp(dest, rhs, lhs);
++      break;
++    default:
++      MOZ_CRASH("Unexpected SIMD float condition");
++  }
++}
++
++void MacroAssembler::compareFloat64x2(Assembler::Condition cond,
++                                      FloatRegister rhs,
++                                      FloatRegister lhsDest) {
++  compareFloat64x2(cond, lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::compareFloat64x2(Assembler::Condition cond,
++                                      FloatRegister lhs, FloatRegister rhs,
++                                      FloatRegister dest) {
++  switch (cond) {
++    case Assembler::Equal:
++      as_xvcmpeqdp(dest, lhs, rhs);
++      break;
++    case Assembler::NotEqual:
++      as_xvcmpeqdp(dest, lhs, rhs);
++      bitwiseNotSimd128(dest, dest);
++      break;
++    case Assembler::GreaterThan:
++      as_xvcmpgtdp(dest, lhs, rhs);
++      break;
++    case Assembler::GreaterThanOrEqual:
++      as_xvcmpgedp(dest, lhs, rhs);
++      break;
++    case Assembler::LessThan:
++      as_xvcmpgtdp(dest, rhs, lhs);
++      break;
++    case Assembler::LessThanOrEqual:
++      as_xvcmpgedp(dest, rhs, lhs);
++      break;
++    default:
++      MOZ_CRASH("Unexpected SIMD float condition");
++  }
++}
++
++void MacroAssembler::negFloat32x4(FloatRegister src, FloatRegister dest) {
++  as_xvnegsp(dest, src);
++}
++
++void MacroAssembler::negFloat64x2(FloatRegister src, FloatRegister dest) {
++  as_xvnegdp(dest, src);
++}
++
++void MacroAssembler::absFloat32x4(FloatRegister src, FloatRegister dest) {
++  as_xvabssp(dest, src);
++}
++
++void MacroAssembler::absFloat64x2(FloatRegister src, FloatRegister dest) {
++  as_xvabsdp(dest, src);
++}
++
++// Per spec:
++//   result[k] = (s|u)ext_widen(src[2k]) + (s|u)ext_widen(src[2k+1])
++// POWER lacks pairwise multiply-add. Emulate via vmulX{e,o}X(src, splat(1))
++// + vadd. Both vmuls need `src` AND `splat(1)` available simultaneously.
++//
++// Available SIMD slots without involving Lowering:
++//   - ScratchSimd128Reg (VR0, non-allocatable)
++//   - dest, src
++// That's 3 regs when dest != src — enough for {src, splat, intermediate}.
++// When dest == src we stash src and the even product to the 288-byte ELFv2
++// red zone and rebuild splat(1).
++//
++// (Earlier implementations of these helpers routed through hardcoded
++// VR1/VR2/VR3 via xxlor_vsr — faster but stomped allocator-managed VRs
++// and silently corrupted any live wasm v128 the allocator had placed
++// there. ScratchSimd128Reg + red-zone stash is the safe contract.)
++// Always-safe pattern: stash src to red zone so dest can be freely overwritten,
++// stash even to red zone after first vmul so we can rebuild splat(1) for the
++// second vmul. The splat-of-1 is now `vspltis{b,h}` (5-bit signed immediate
++// splat) — 1 insn vs the 3-insn movePtr+mtvsrd+vsplt sequence the previous
++// path used.
++// Pattern: stash src to red zone slot 0 so dest can be freely overwritten;
++// vmul-even (signed/unsigned) of src with splat(1) produces sign/zero-extended
++// even-lane products into dest; stash that to slot 1 and rebuild scratch=src
++// (slot 0) and dest=splat(1); vmul-odd produces the odd products; restore
++// even from slot 1 and pairwise-add.
++void MacroAssembler::extAddPairwiseInt8x16(FloatRegister src,
++                                           FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  uint8_t s = scratch.encoding() & 31;
++  uint8_t srcEnc = src.encoding() & 31;
++  uint8_t destEnc = dest.encoding() & 31;
++  RedZoneStashSimd128(*this, src, 0);
++  as_vspltisb(s, 1);
++  as_vmulesb(destEnc, srcEnc, s);
++  RedZoneStashSimd128(*this, dest, 1);
++  RedZoneRestoreSimd128(*this, 0, scratch);
++  as_vspltisb(destEnc, 1);
++  as_vmulosb(destEnc, s, destEnc);
++  RedZoneRestoreSimd128(*this, 1, scratch);
++  as_vadduhm(destEnc, destEnc, s);
++}
++
++void MacroAssembler::unsignedExtAddPairwiseInt8x16(FloatRegister src,
++                                                   FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  uint8_t s = scratch.encoding() & 31;
++  uint8_t srcEnc = src.encoding() & 31;
++  uint8_t destEnc = dest.encoding() & 31;
++  RedZoneStashSimd128(*this, src, 0);
++  as_vspltisb(s, 1);
++  as_vmuleub(destEnc, srcEnc, s);
++  RedZoneStashSimd128(*this, dest, 1);
++  RedZoneRestoreSimd128(*this, 0, scratch);
++  as_vspltisb(destEnc, 1);
++  as_vmuloub(destEnc, s, destEnc);
++  RedZoneRestoreSimd128(*this, 1, scratch);
++  as_vadduhm(destEnc, destEnc, s);
++}
++
++// vmsumshm/vmsumuhm collapse the i16x8 → i32x4 pairwise-add into a single
++// multiply-sum: VT.i32[k] = VRA.i16[2k]*VRB.i16[2k] +
++// VRA.i16[2k+1]*VRB.i16[2k+1]
++// + VRC.i32[k]. With VRB = splat(1) and VRC = 0 this is exactly the wasm
++// i32x4.extadd_pairwise_i16x8_{s,u} contract. 3 insns when dest != src;
++// LWasmUnarySimd128 uses useRegisterAtStart so dest may alias src — in that
++// case we put splat(1) into scratch (preserving src in dest) and use a
++// red-zone slot for the zero VRC operand.
++void MacroAssembler::extAddPairwiseInt16x8(FloatRegister src,
++                                           FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  if (dest != src) {
++    as_xxlxor(scratch, scratch, scratch);  // scratch = 0 (VRC addend)
++    as_vspltish(dest.encoding() & 31, 1);  // dest = splat(1) (VRB multiplier)
++    as_vmsumshm(dest.encoding() & 31, src.encoding() & 31, dest.encoding() & 31,
++                scratch.encoding() & 31);
++    return;
++  }
++  // dest == src: load splat(1) into scratch instead, stash zero to the red
++  // zone, restore zero into scratch after the splat is consumed... actually
++  // simpler: use vmule/vmulo + vadd trio with red zone. Same shape as the
++  // pre-vmsumshm fallback for i8x16.
++  uint8_t s = scratch.encoding() & 31;
++  uint8_t srcEnc = src.encoding() & 31;
++  uint8_t destEnc = dest.encoding() & 31;
++  RedZoneStashSimd128(*this, src, 0);
++  as_vspltish(s, 1);
++  as_vmulesh(destEnc, srcEnc, s);
++  RedZoneStashSimd128(*this, dest, 1);
++  RedZoneRestoreSimd128(*this, 0, scratch);
++  as_vspltish(destEnc, 1);
++  as_vmulosh(destEnc, s, destEnc);
++  RedZoneRestoreSimd128(*this, 1, scratch);
++  as_vadduwm(destEnc, destEnc, s);
++}
++
++void MacroAssembler::unsignedExtAddPairwiseInt16x8(FloatRegister src,
++                                                   FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  if (dest != src) {
++    as_xxlxor(scratch, scratch, scratch);
++    as_vspltish(dest.encoding() & 31, 1);
++    as_vmsumuhm(dest.encoding() & 31, src.encoding() & 31, dest.encoding() & 31,
++                scratch.encoding() & 31);
++    return;
++  }
++  uint8_t s = scratch.encoding() & 31;
++  uint8_t srcEnc = src.encoding() & 31;
++  uint8_t destEnc = dest.encoding() & 31;
++  RedZoneStashSimd128(*this, src, 0);
++  as_vspltish(s, 1);
++  as_vmuleuh(destEnc, srcEnc, s);
++  RedZoneStashSimd128(*this, dest, 1);
++  RedZoneRestoreSimd128(*this, 0, scratch);
++  as_vspltish(destEnc, 1);
++  as_vmulouh(destEnc, s, destEnc);
++  RedZoneRestoreSimd128(*this, 1, scratch);
++  as_vadduwm(destEnc, destEnc, s);
++}
++
++void MacroAssembler::sqrtFloat32x4(FloatRegister src, FloatRegister dest) {
++  as_xvsqrtsp(dest, src);
++}
++
++void MacroAssembler::sqrtFloat64x2(FloatRegister src, FloatRegister dest) {
++  as_xvsqrtdp(dest, src);
++}
++
++void MacroAssembler::convertInt32x4ToFloat32x4(FloatRegister src,
++                                               FloatRegister dest) {
++  as_xvcvsxwsp(dest, src);
++}
++
++void MacroAssembler::unsignedConvertInt32x4ToFloat32x4(FloatRegister src,
++                                                       FloatRegister dest) {
++  as_xvcvuxwsp(dest, src);
++}
++
++// i32x4 (low 2 lanes) → f64x2. Wasm `f64x2.convert_low_i32x4_{s,u}`.
++// xvcv{s,u}xwdp converts BE word 0 and BE word 2 of source to doubles in
++// BE dwords 0 and 1. vmrglw places src.word_BE[2,3] at the read positions,
++// matching the f32→f64 promote shape:
++//   vmrglw    scratch, src, src    ; BE words 2,3 of src → BE words 0,2 of
++//   scratch xvcv*xwdp dest, scratch        ; convert both, place in BE dwords
++//   0,1
++// Output BE dwords land as [convert(input lane 1), convert(input lane 0)],
++// which on PPC64LE storage IS the wasm output layout.
++//
++// 2 insns each, single ScratchSimd128 scope, no GPR or FPR scratch.
++// All ops POWER7+. dest==src aliasing safe (vmrglw consumes src into
++// scratch before dest is written).
++void MacroAssembler::convertInt32x4ToFloat64x2(FloatRegister src,
++                                               FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  as_vmrglw(scratch.encoding() & 31, src.encoding() & 31, src.encoding() & 31);
++  as_xvcvsxwdp(dest, scratch);
++}
++
++void MacroAssembler::unsignedConvertInt32x4ToFloat64x2(FloatRegister src,
++                                                       FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  as_vmrglw(scratch.encoding() & 31, src.encoding() & 31, src.encoding() & 31);
++  as_xvcvuxwdp(dest, scratch);
++}
++
++void MacroAssembler::truncSatFloat32x4ToInt32x4(FloatRegister src,
++                                                FloatRegister dest) {
++  // xvcvspsxws gives INT32_MIN for NaN, but Wasm requires 0.
++  ScratchSimd128Scope scratch(*this);
++  as_xvcmpeqsp(scratch, src, src);  // ~0 for non-NaN, 0 for NaN
++  as_xvcvspsxws(dest, src);
++  as_xxland(dest, dest, scratch);  // zero NaN lanes
++}
++
++// Pack the two "interesting" 32-bit results that xvcv*xws / xvcvdpsp leaves
++// at scratch.word_BE[0] (= A) and scratch.word_BE[2] (= B) into a zeroed dest
++// as dest.word_BE = [0, 0, A, B]. This is the layout wasm requires for
++// f64x2 → {i32x4 trunc_sat, f32x4 demote}. Writes dest, consumes scratch.
++//
++// POWER9 path (4 insns) uses xxinsertw/xxextractuw. POWER8 path (7 insns)
++// goes via two GPR round-trips: extract A and B with mfvsrd, splice them
++// into a single dword with rldimi, mtvsrd back into a SIMD reg, and
++// xxpermdi the result into dest.dw1 while keeping dest.dw0 zero.
++static inline void PackTwoWordsToLowHalf(MacroAssembler& masm,
++                                         FloatRegister scratch,
++                                         FloatRegister dest) {
++  if (HasPOWER9()) {
++    masm.as_xxinsertw(dest, scratch,
++                      8);  // dest.word_BE[2] ← scratch.word_BE[1] (= A)
++    masm.as_xxextractuw(scratch, scratch,
++                        8);  // scratch.word_BE[1] ← scratch.word_BE[2] (= B)
++    masm.as_xxinsertw(dest, scratch,
++                      12);  // dest.word_BE[3] ← scratch.word_BE[1] (= B)
++    return;
++  }
++  // POWER8: xxinsertw/xxextractuw are ISA 3.0. Take a GPR detour instead.
++  // scratch.dw_BE[0] = (A << 32) | A, scratch.dw_BE[1] = (B << 32) | B.
++  UseScratchRegisterScope temps(masm);
++  Register tmpA = temps.Acquire();
++  Register tmpB = temps.Acquire();
++  masm.as_mfvsrd(tmpA, scratch);  // tmpA = (A << 32) | A
++  masm.as_xxpermdi(scratch, scratch, scratch,
++                   2);            // swap dwords: now dw0 = (B<<32)|B
++  masm.as_mfvsrd(tmpB, scratch);  // tmpB = (B << 32) | B
++  masm.x_srdi(tmpA, tmpA, 32);    // tmpA = 0x00000000_AAAAAAAA
++  masm.as_rldimi(tmpB, tmpA, 32,
++                 0);              // tmpB[0..31] = A; tmpB[32..63] = B (kept)
++  masm.as_mtvsrd(scratch, tmpB);  // scratch.dw_BE[0] = (A << 32) | B; dw1 = 0
++  masm.as_xxpermdi(dest, dest, scratch,
++                   0);  // dest = {dest.dw0=0, scratch.dw0} = [0, 0, A, B]
++}
++
++// fctiwz / fcmpu / fctiduz are X-form scalar FP instructions that only
++// encode 5-bit FRT/FRB fields, so emitting them on a Simd128 reg
++// (encoding 32+) would corrupt the opcode. Bridge through
++// ScratchDoubleReg (FPR f0) for the conversion. Extract both lanes' GPR
++// results before writing dest so that dest == src is safe.
++//
++// Avoid replaceLaneInt32x4 on the tail: on POWER8 it needs an extra
++// GPR scratch, but r11 and r12 are already held as a/b here. Pack both
++// int32s into `a` with rldimi, transfer via mtvsrd, then xxpermdi the
++// DWs into the low half so wasm lane 0 (BE W3) holds a, lane 1 (W2) b.
++void MacroAssembler::truncSatFloat64x2ToInt32x4(FloatRegister src,
++                                                FloatRegister dest,
++                                                FloatRegister temp) {
++  // Wasm `i32x4.trunc_sat_f64x2_s_zero`. xvcvdpsxws saturates to INT32_MIN
++  // on overflow/NaN (per ISA); wasm requires NaN → 0, so a per-dword NaN
++  // mask via xvcmpeqdp clamps NaN lanes to 0 before laying out the result.
++  // Output BE word positions need wasm lane order: lane 1 → BE word 2,
++  // lane 0 → BE word 3. xvcvdpsxws lands its results at BE words 0 and 2
++  // (with replication into 1/3); PackTwoWordsToLowHalf moves them into
++  // the right positions while zeroing the rest.
++  // dest==src safe: src is consumed by xvcvdpsxws and xvcmpeqdp before
++  // dest is zeroed.
++  ScratchSimd128Scope scratch(*this);
++  as_xvcvdpsxws(scratch, src);
++  as_xvcmpeqdp(dest, src,
++               src);  // NaN-mask: 0xFF...F per dword for non-NaN, 0 for NaN
++  as_xxland(scratch, scratch, dest);
++  as_xxlxor(dest, dest, dest);
++  PackTwoWordsToLowHalf(*this, scratch, dest);
++}
++
++void MacroAssembler::unsignedTruncSatFloat64x2ToInt32x4(FloatRegister src,
++                                                        FloatRegister dest,
++                                                        FloatRegister temp) {
++  // Wasm `i32x4.trunc_sat_f64x2_u_zero`. xvcvdpuxws semantics already
++  // match the wasm spec without any masking: NaN → 0, negative → 0,
++  // positive overflow → UINT32_MAX. So no NaN mask needed; just position
++  // the saturated results into BE words 2,3 with zeros at words 0,1.
++  // dest==src safe: src consumed by xvcvdpuxws before dest is zeroed.
++  ScratchSimd128Scope scratch(*this);
++  as_xvcvdpuxws(scratch, src);
++  as_xxlxor(dest, dest, dest);
++  PackTwoWordsToLowHalf(*this, scratch, dest);
++}
++
++void MacroAssembler::truncFloat32x4ToInt32x4Relaxed(FloatRegister src,
++                                                    FloatRegister dest) {
++  truncSatFloat32x4ToInt32x4(src, dest);
++}
++
++void MacroAssembler::unsignedTruncFloat32x4ToInt32x4Relaxed(
++    FloatRegister src, FloatRegister dest) {
++  unsignedTruncSatFloat32x4ToInt32x4(src, dest);
++}
++
++void MacroAssembler::truncFloat64x2ToInt32x4Relaxed(FloatRegister src,
++                                                    FloatRegister dest) {
++  truncSatFloat64x2ToInt32x4(src, dest, ScratchSimd128Reg);
++}
++
++void MacroAssembler::unsignedTruncFloat64x2ToInt32x4Relaxed(
++    FloatRegister src, FloatRegister dest) {
++  unsignedTruncSatFloat64x2ToInt32x4(src, dest, ScratchSimd128Reg);
++}
++
++// f64x2 → f32x4 (low 2 lanes; high lanes zero). Wasm `f32x4.demote_f64x2_zero`.
++// xvcvdpsp converts both doubles in one shot, replicating each result across
++// its dword: BE word lanes = [s(in.dw0), s(in.dw0), s(in.dw1), s(in.dw1)].
++// On PPC64LE wasm storage (lxvx-loaded), input.dw_BE[0] = wasm lane 1 and
++// input.dw_BE[1] = wasm lane 0, so we get [s(l1), s(l1), s(l0), s(l0)] in
++// BE word order. We then zero dest and pack s(l1) into BE word 2 (wasm
++// output lane 1) and s(l0) into BE word 3 (wasm output lane 0) via the
++// shared PackTwoWordsToLowHalf helper, which has POWER9 and POWER8 paths.
++//
++// dest==src aliasing safe: src is consumed by xvcvdpsp before dest is zeroed.
++void MacroAssembler::convertFloat64x2ToFloat32x4(FloatRegister src,
++                                                 FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  as_xvcvdpsp(scratch, src);
++  ZeroSimd128(*this, dest);
++  PackTwoWordsToLowHalf(*this, scratch, dest);
++}
++
++// f32x4 (low 2 lanes) → f64x2. Wasm `f64x2.promote_low_f32x4`. xvcvspdp
++// converts both BE word 0 and BE word 2 of its source to doubles in BE
++// dwords 0 and 1 respectively. To get wasm lanes 0 and 1 (= input BE
++// words 3 and 2) into those source positions, vmrglw merges low words:
++// VRT.word[0] = VRA.word[2] = wasm lane 1, VRT.word[2] = VRA.word[3] =
++// wasm lane 0 (with replicated copies in odd word slots that xvcvspdp
++// ignores). Output BE dwords land as [double(lane1), double(lane0)],
++// which on PPC64LE storage is exactly the wasm f64x2 output layout.
++//
++// dest==src aliasing safe: vmrglw consumes src into a separate scratch
++// before dest is written.
++//
++// 2 insns, single ScratchSimd128 scope. All ops POWER7+.
++void MacroAssembler::convertFloat32x4ToFloat64x2(FloatRegister src,
++                                                 FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  as_vmrglw(scratch.encoding() & 31, src.encoding() & 31, src.encoding() & 31);
++  as_xvcvspdp(dest, scratch);
++}
++
++void MacroAssembler::unsignedNarrowInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                           FloatRegister dest) {
++  // On LE, VMX pack swaps operand order vs Wasm convention.
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vpkshus), rhs, lhs, dest);
++}
++
++void MacroAssembler::unsignedNarrowInt32x4(FloatRegister lhs, FloatRegister rhs,
++                                           FloatRegister dest) {
++  // On LE, VMX pack swaps operand order vs Wasm convention.
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vpkswus), rhs, lhs, dest);
++}
++
++void MacroAssembler::widenLowInt8x16(FloatRegister src, FloatRegister dest) {
++  // On PPC64 LE, raw vupklsb unpacks the LOW Wasm lanes (not vupkhsb).
++  // GCC vec_unpackh maps to vupklsb on LE (swapped from BE naming).
++  // Raw vupklsb([1..8,-1..-8]) = [1,2,3,4,5,6,7,8].
++  EmitVmxUnary(
++      *this,
++      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupklsb(vrt, vrb); },
++      src, dest);
++}
++
++void MacroAssembler::widenHighInt8x16(FloatRegister src, FloatRegister dest) {
++  // On PPC64 LE, raw vupkhsb unpacks the HIGH Wasm lanes.
++  EmitVmxUnary(
++      *this,
++      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupkhsb(vrt, vrb); },
++      src, dest);
++}
++
++void MacroAssembler::unsignedWidenLowInt8x16(FloatRegister src,
++                                             FloatRegister dest) {
++  zeroExtend8x16To16x8(src, dest);
++}
++
++void MacroAssembler::unsignedWidenHighInt8x16(FloatRegister src,
++                                              FloatRegister dest) {
++  // vmrghb(zero, src) interleaves zero bytes with the BE-high half of src,
++  // producing zero-extended halfwords of the LE-high (Wasm-high) lanes.
++  ScratchSimd128Scope scratch(*this);
++  as_xxlxor(scratch, scratch, scratch);
++  as_vmrghb(dest.encoding() & 31, scratch.encoding() & 31, src.encoding() & 31);
++}
++
++void MacroAssembler::widenLowInt16x8(FloatRegister src, FloatRegister dest) {
++  // On PPC64 LE, raw vupklsh unpacks LOW Wasm lanes (GCC swaps h/l on LE).
++  EmitVmxUnary(
++      *this,
++      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupklsh(vrt, vrb); },
++      src, dest);
++}
++
++void MacroAssembler::widenHighInt16x8(FloatRegister src, FloatRegister dest) {
++  EmitVmxUnary(
++      *this,
++      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupkhsh(vrt, vrb); },
++      src, dest);
++}
++
++void MacroAssembler::unsignedWidenLowInt16x8(FloatRegister src,
++                                             FloatRegister dest) {
++  zeroExtend16x8To32x4(src, dest);
++}
++
++void MacroAssembler::unsignedWidenHighInt16x8(FloatRegister src,
++                                              FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  as_xxlxor(scratch, scratch, scratch);
++  as_vmrghh(dest.encoding() & 31, scratch.encoding() & 31, src.encoding() & 31);
++}
++
++void MacroAssembler::widenLowInt32x4(FloatRegister src, FloatRegister dest) {
++  // On PPC64 LE, raw vupklsw unpacks LOW Wasm lanes.
++  EmitVmxUnary(
++      *this,
++      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupklsw(vrt, vrb); },
++      src, dest);
++}
++
++void MacroAssembler::unsignedWidenLowInt32x4(FloatRegister src,
++                                             FloatRegister dest) {
++  zeroExtend32x4To64x2(src, dest);
++}
++
++void MacroAssembler::widenHighInt32x4(FloatRegister src, FloatRegister dest) {
++  EmitVmxUnary(
++      *this,
++      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupkhsw(vrt, vrb); },
++      src, dest);
++}
++
++void MacroAssembler::unsignedWidenHighInt32x4(FloatRegister src,
++                                              FloatRegister dest) {
++  // i64x2.extend_high_i32x4_u: take high 2 i32 lanes of src, zero-extend
++  // to i64 each. Use vmrghw to interleave a zero VR with src — same shape
++  // as the (already-correct) unsignedWidenHighInt16x8 sibling above.
++  ScratchSimd128Scope scratch(*this);
++  ZeroSimd128(*this, scratch);
++  as_vmrghw(dest.encoding() & 31, scratch.encoding() & 31, src.encoding() & 31);
++}
++
++void MacroAssembler::pseudoMinFloat32x4(FloatRegister rhsOrRhsDest,
++                                        FloatRegister lhsOrLhsDest) {
++  // pmin: result[i] = rhs[i] < lhs[i] ? rhs[i] : lhs[i]
++  // xvcmpgtsp(mask, lhs, rhs) → 1 where lhs > rhs (i.e., rhs < lhs)
++  // xxsel: mask=1 → XB=rhs. mask=0 → XA=lhs.
++  // Result goes to lhsOrLhsDest (second param).
++  ScratchSimd128Scope scratch(*this);
++  as_xvcmpgtsp(scratch, lhsOrLhsDest, rhsOrRhsDest);
++  as_xxsel(lhsOrLhsDest, lhsOrLhsDest, rhsOrRhsDest, scratch);
++}
++
++void MacroAssembler::pseudoMinFloat32x4(FloatRegister lhs, FloatRegister rhs,
++                                        FloatRegister dest) {
++  // pmin(lhs, rhs) = rhs < lhs ? rhs : lhs
++  // Inline to handle dest aliasing with either operand.
++  ScratchSimd128Scope scratch(*this);
++  as_xvcmpgtsp(scratch, lhs, rhs);
++  // mask=1 where lhs > rhs. XC=1 → select XB=rhs. XC=0 → select XA=lhs.
++  as_xxsel(dest, lhs, rhs, scratch);
++}
++
++void MacroAssembler::pseudoMinFloat64x2(FloatRegister rhsOrRhsDest,
++                                        FloatRegister lhsOrLhsDest) {
++  ScratchSimd128Scope scratch(*this);
++  as_xvcmpgtdp(scratch, lhsOrLhsDest, rhsOrRhsDest);
++  as_xxsel(lhsOrLhsDest, lhsOrLhsDest, rhsOrRhsDest, scratch);
++}
++
++void MacroAssembler::pseudoMinFloat64x2(FloatRegister lhs, FloatRegister rhs,
++                                        FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  as_xvcmpgtdp(scratch, lhs, rhs);
++  as_xxsel(dest, lhs, rhs, scratch);
++}
++
++void MacroAssembler::pseudoMaxFloat32x4(FloatRegister rhsOrRhsDest,
++                                        FloatRegister lhsOrLhsDest) {
++  ScratchSimd128Scope scratch(*this);
++  as_xvcmpgtsp(scratch, rhsOrRhsDest, lhsOrLhsDest);
++  as_xxsel(lhsOrLhsDest, lhsOrLhsDest, rhsOrRhsDest, scratch);
++}
++
++void MacroAssembler::pseudoMaxFloat32x4(FloatRegister lhs, FloatRegister rhs,
++                                        FloatRegister dest) {
++  // pmax(lhs, rhs) = lhs < rhs ? rhs : lhs
++  ScratchSimd128Scope scratch(*this);
++  as_xvcmpgtsp(scratch, rhs, lhs);
++  // mask=1 where rhs > lhs (lhs < rhs). XC=1 → select XB=rhs. XC=0 → select
++  // XA=lhs.
++  as_xxsel(dest, lhs, rhs, scratch);
++}
++
++void MacroAssembler::pseudoMaxFloat64x2(FloatRegister rhsOrRhsDest,
++                                        FloatRegister lhsOrLhsDest) {
++  ScratchSimd128Scope scratch(*this);
++  as_xvcmpgtdp(scratch, rhsOrRhsDest, lhsOrLhsDest);
++  as_xxsel(lhsOrLhsDest, lhsOrLhsDest, rhsOrRhsDest, scratch);
++}
++
++void MacroAssembler::pseudoMaxFloat64x2(FloatRegister lhs, FloatRegister rhs,
++                                        FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  as_xvcmpgtdp(scratch, rhs, lhs);
++  as_xxsel(dest, lhs, rhs, scratch);
++}
++
++void MacroAssembler::dotInt8x16Int7x16(FloatRegister lhs, FloatRegister rhs,
++                                       FloatRegister dest) {
++  // result[k] = lhs[2k]*rhs[2k] + lhs[2k+1]*rhs[2k+1] for k=0..7.
++  // vmulesb/vmulosb produce even/odd byte products as i16 in matching
++  // halfword lanes; vadduhm sums them pairwise.
++  ScratchSimd128Scope scratch(*this);
++  uint8_t l = lhs.encoding() & 31;
++  uint8_t r = rhs.encoding() & 31;
++  uint8_t d = dest.encoding() & 31;
++  uint8_t s = scratch.encoding() & 31;
++  as_vmulesb(s, l, r);
++  as_vmulosb(d, l, r);
++  as_vadduhm(d, s, d);
++}
++
++void MacroAssembler::ceilFloat32x4(FloatRegister src, FloatRegister dest) {
++  as_xvrspip(dest, src);
++}
++
++void MacroAssembler::ceilFloat64x2(FloatRegister src, FloatRegister dest) {
++  as_xvrdpip(dest, src);
++}
++
++void MacroAssembler::floorFloat32x4(FloatRegister src, FloatRegister dest) {
++  as_xvrspim(dest, src);
++}
++
++void MacroAssembler::floorFloat64x2(FloatRegister src, FloatRegister dest) {
++  as_xvrdpim(dest, src);
++}
++
++void MacroAssembler::truncFloat32x4(FloatRegister src, FloatRegister dest) {
++  as_xvrspiz(dest, src);
++}
++
++void MacroAssembler::truncFloat64x2(FloatRegister src, FloatRegister dest) {
++  as_xvrdpiz(dest, src);
++}
++
++void MacroAssembler::nearestFloat32x4(FloatRegister src, FloatRegister dest) {
++  as_xvrspic(dest, src);
++}
++
++void MacroAssembler::nearestFloat64x2(FloatRegister src, FloatRegister dest) {
++  as_xvrdpic(dest, src);
++}
++
++void MacroAssembler::fnmaFloat32x4(FloatRegister src1, FloatRegister src2,
++                                   FloatRegister srcDest) {
++  as_xvnmsubasp(srcDest, src1, src2);
++}
++
++void MacroAssembler::fnmaFloat64x2(FloatRegister src1, FloatRegister src2,
++                                   FloatRegister srcDest) {
++  as_xvnmsubadp(srcDest, src1, src2);
++}
++
++void MacroAssembler::minFloat32x4Relaxed(FloatRegister src,
++                                         FloatRegister srcDest) {
++  as_xvminsp(srcDest, srcDest, src);
++}
++
++void MacroAssembler::minFloat32x4Relaxed(FloatRegister lhs, FloatRegister rhs,
++                                         FloatRegister dest) {
++  as_xvminsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::maxFloat32x4Relaxed(FloatRegister src,
++                                         FloatRegister srcDest) {
++  as_xvmaxsp(srcDest, srcDest, src);
++}
++
++void MacroAssembler::maxFloat32x4Relaxed(FloatRegister lhs, FloatRegister rhs,
++                                         FloatRegister dest) {
++  as_xvmaxsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::minFloat64x2Relaxed(FloatRegister src,
++                                         FloatRegister srcDest) {
++  as_xvmindp(srcDest, srcDest, src);
++}
++
++void MacroAssembler::minFloat64x2Relaxed(FloatRegister lhs, FloatRegister rhs,
++                                         FloatRegister dest) {
++  as_xvmindp(dest, lhs, rhs);
++}
++
++void MacroAssembler::maxFloat64x2Relaxed(FloatRegister src,
++                                         FloatRegister srcDest) {
++  as_xvmaxdp(srcDest, srcDest, src);
++}
++
++void MacroAssembler::maxFloat64x2Relaxed(FloatRegister lhs, FloatRegister rhs,
++                                         FloatRegister dest) {
++  as_xvmaxdp(dest, lhs, rhs);
++}
++
++void MacroAssembler::q15MulrInt16x8Relaxed(FloatRegister lhs, FloatRegister rhs,
++                                           FloatRegister dest) {
++  q15MulrSatInt16x8(lhs, rhs, dest);
++}
++
++// SIMD overloads accepting an extra FloatRegister temp (shared-header signature
++// used by x86; on PPC64 the temp is unused for most of these).
++void MacroAssembler::popcntInt8x16(FloatRegister src, FloatRegister dest,
++                                   FloatRegister temp) {
++  popcntInt8x16(src, dest);
++}
++
++void MacroAssembler::unsignedTruncSatFloat32x4ToInt32x4(FloatRegister src,
++                                                        FloatRegister dest,
++                                                        FloatRegister temp) {
++  unsignedTruncSatFloat32x4ToInt32x4(src, dest);
++}
++
++void MacroAssembler::dotInt8x16Int7x16ThenAdd(FloatRegister lhs,
++                                              FloatRegister rhs,
++                                              FloatRegister dest,
++                                              FloatRegister temp) {
++  // dest += pairwise_widen_i16_to_i32(dot_i8x16(lhs, rhs)).
++  //
++  // Step 1: i16x8 dot of i8 byte pairs (vmulesb/vmulosb/vadduhm). Keeps
++  // the existing signed-byte multiply semantics that match ARM64 sdot
++  // and x86 vpdpbssd (vmsummbm would be signed×unsigned and diverge for
++  // i7 lanes that bit-pattern as negative).
++  //
++  // Step 2: vmsumshm dest, dot, splat_hw(1), dest computes
++  //   dest.i32[k] = dest.i32[k] + dot.i16[2k]*1 + dot.i16[2k+1]*1
++  // which is exactly pairwise widen + accumulate in a single insn.
++  // splat_hw(1) is a single vspltish (5-bit SIMM splat to all 8 halfwords).
++  ScratchSimd128Scope scratch(*this);
++  uint8_t l = lhs.encoding() & 31;
++  uint8_t r = rhs.encoding() & 31;
++  uint8_t d = dest.encoding() & 31;
++  uint8_t s = scratch.encoding() & 31;
++  uint8_t t = temp.encoding() & 31;
++
++  as_vmulesb(s, l, r);
++  as_vmulosb(t, l, r);
++  as_vadduhm(t, s, t);
++  as_vspltish(s, 1);
++  as_vmsumshm(d, t, s, d);
++}
++
++// SIMD ops ported from arm64- and x86/x64-shaped signatures.
++
++void MacroAssembler::permuteInt16x8(const uint16_t lanes[8], FloatRegister src,
++                                    FloatRegister dest) {
++  uint8_t shuffleLanes[16];
++  for (unsigned i = 0; i < 8; i++) {
++    shuffleLanes[i * 2] = lanes[i] * 2;
++    shuffleLanes[i * 2 + 1] = lanes[i] * 2 + 1;
++  }
++  shuffleInt8x16(shuffleLanes, src, src, dest);
++}
++
++void MacroAssembler::rotateRightSimd128(FloatRegister src, FloatRegister dest,
++                                        uint32_t shift) {
++  MOZ_ASSERT(shift < 16);
++  if (shift == 0) {
++    moveSimd128(src, dest);
++    return;
++  }
++  // vsldoi VRT,VRA,VRB,SH: concatenate VRA||VRB, take bytes [SH..SH+15].
++  // Rotate right by N = vsldoi(src, src, 16-N).
++  as_vsldoi(dest, src, src, 16 - shift);
++}
++
++void MacroAssembler::mulInt64x2(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest, FloatRegister temp1,
++                                FloatRegister temp2) {
++  // POWER10 collapses the entire i64x2 multiply to a single vmulld.
++  // POWER9/POWER8 fall back to the GPR round-trip path: extract each
++  // lane pair into GPRs (mfvsrld for LE-dw0/Wasm-lane-0, mfvsrd for
++  // LE-dw1/lane-1), multiply, and reassemble via mtvsrd + xxpermdi.
++  if (HasPOWER10()) {
++    as_vmulld(dest.encoding() & 31, lhs.encoding() & 31, rhs.encoding() & 31);
++    return;
++  }
++  // Aliasing safety: stash the lane-0 product in ScratchSimd128 (which
++  // is non-allocatable, so cannot alias lhs/rhs) and only write dest at
++  // the very end, after both lhs and rhs have been fully consumed.
++  ScratchSimd128Scope scratch(*this);
++  UseScratchRegisterScope temps(asMasm());
++  Register a = temps.Acquire();
++  Register b = temps.Acquire();
++
++  if (HasPOWER9()) {
++    as_mfvsrld(a, lhs);
++    as_mfvsrld(b, rhs);
++  } else {
++    as_xxpermdi(scratch, lhs, lhs, 2);
++    as_mfvsrd(a, scratch);
++    as_xxpermdi(scratch, rhs, rhs, 2);
++    as_mfvsrd(b, scratch);
++  }
++  as_mulld(a, a, b);
++  as_mtvsrd(scratch, a);
++
++  as_mfvsrd(a, lhs);
++  as_mfvsrd(b, rhs);
++  as_mulld(a, a, b);
++  as_mtvsrd(dest, a);
++  as_xxpermdi(dest, dest, scratch, 0);
++}
++
++void MacroAssembler::bitwiseAndNotSimd128(FloatRegister lhs, FloatRegister rhs,
++                                          FloatRegister dest) {
++  // andnot(lhs, rhs) = lhs & ~rhs = xxlandc(lhs, rhs)
++  as_xxlandc(dest, lhs, rhs);
++}
++
++void MacroAssembler::bitwiseSelectSimd128(FloatRegister onTrue,
++                                          FloatRegister onFalse,
++                                          FloatRegister maskDest) {
++  // result = (onTrue & mask) | (onFalse & ~mask)
++  // xxsel: XC=0→XA, XC=1→XB → XT = (XA & ~XC) | (XB & XC)
++  // Need XA=onFalse, XB=onTrue, XC=mask.
++  as_xxsel(maskDest, onFalse, onTrue, maskDest);
++}
++
++void MacroAssembler::popcntInt8x16(FloatRegister src, FloatRegister dest) {
++  EmitVmxUnary(
++      *this,
++      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vpopcntb(vrt, vrb); },
++      src, dest);
++}
++
++void MacroAssembler::bitmaskInt8x16(FloatRegister src, Register dest,
++                                    FloatRegister temp) {
++  if (HasPOWER10()) {
++    // Single-instruction collapse on POWER10.
++    as_vextractbm(dest, src);
++    return;
++  }
++  // POWER8+ vbpermq-based bitmask: ctl[i] = (15-i)*8 produces the wasm-spec
++  // bitmap (bit i = MSB of LE lane i) in dw0 low 16 bits.
++  int8_t ctl[16] = {120, 112, 104, 96, 88, 80, 72, 64,
++                    56,  48,  40,  32, 24, 16, 8,  0};
++  loadConstantSimd128(SimdConstant::CreateX16(ctl), temp);
++  as_vbpermq(temp.encoding() & 31, src.encoding() & 31, temp.encoding() & 31);
++  as_mfvsrd(dest, temp);
++}
++
++void MacroAssembler::bitmaskInt16x8(FloatRegister src, Register dest,
++                                    FloatRegister temp) {
++  if (HasPOWER10()) {
++    as_vextracthm(dest, src);
++    return;
++  }
++  // Same recipe as bitmaskInt8x16 but ctl picks halfword MSBs:
++  // BE bit (14-2i)*8 for lane i, plus 8 ignore-bytes (high bit set).
++  int8_t ctl[16] = {112,  96,   80,   64,   48,   32,   16,   0,
++                    -128, -128, -128, -128, -128, -128, -128, -128};
++  loadConstantSimd128(SimdConstant::CreateX16(ctl), temp);
++  as_vbpermq(temp.encoding() & 31, src.encoding() & 31, temp.encoding() & 31);
++  as_mfvsrd(dest, temp);
++}
++
++void MacroAssembler::bitmaskInt32x4(FloatRegister src, Register dest,
++                                    FloatRegister temp) {
++  if (HasPOWER10()) {
++    as_vextractwm(dest, src);
++    return;
++  }
++  // Same recipe as bitmaskInt8x16 but ctl picks word MSBs:
++  // BE bit (12-4i)*8 for lane i, plus 12 ignore-bytes (high bit set).
++  int8_t ctl[16] = {96,   64,   32,   0,    -128, -128, -128, -128,
++                    -128, -128, -128, -128, -128, -128, -128, -128};
++  loadConstantSimd128(SimdConstant::CreateX16(ctl), temp);
++  as_vbpermq(temp.encoding() & 31, src.encoding() & 31, temp.encoding() & 31);
++  as_mfvsrd(dest, temp);
++}
++
++void MacroAssembler::bitmaskInt64x2(FloatRegister src, Register dest,
++                                    FloatRegister temp) {
++  if (HasPOWER10()) {
++    as_vextractdm(dest, src);
++    return;
++  }
++  // Same recipe as the other bitmask variants. ctl picks dword MSBs:
++  // BE bit 64 for lane 0, BE bit 0 for lane 1, plus 14 ignore-bytes.
++  int8_t ctl[16] = {64,   0,    -128, -128, -128, -128, -128, -128,
++                    -128, -128, -128, -128, -128, -128, -128, -128};
++  loadConstantSimd128(SimdConstant::CreateX16(ctl), temp);
++  as_vbpermq(temp.encoding() & 31, src.encoding() & 31, temp.encoding() & 31);
++  as_mfvsrd(dest, temp);
++}
++
++void MacroAssembler::compareInt64x2(Assembler::Condition cond,
++                                    FloatRegister rhs, FloatRegister lhsDest) {
++  compareInt64x2(cond, lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::compareInt64x2(Assembler::Condition cond,
++                                    FloatRegister lhs, FloatRegister rhs,
++                                    FloatRegister dest) {
++  EmitVmxCompare(*this, cond, lhs, rhs, dest, VMX_BINARY_WRAPPER(vcmpequd),
++                 VMX_BINARY_WRAPPER(vcmpgtsd), VMX_BINARY_WRAPPER(vcmpgtud));
++}
++
++void MacroAssembler::minFloat32x4(FloatRegister rhs, FloatRegister lhsDest) {
++  minFloat32x4(lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::minFloat32x4(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest) {
++  as_xvminsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::minFloat32x4(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest, FloatRegister temp1,
++                                  FloatRegister temp2) {
++  // Wasm min with NaN propagation.
++  // Detect NaN in either operand (not via add which falsely flags inf+(-inf)).
++  // Compute mask and add BEFORE min (min may clobber lhs via dest aliasing).
++  as_xvcmpeqsp(temp1, lhs, lhs);
++  as_xvcmpeqsp(temp2, rhs, rhs);
++  as_xxland(temp1, temp1, temp2);
++  as_xvaddsp(temp2, lhs, rhs);
++  as_xvminsp(dest, lhs, rhs);
++  as_xxsel(dest, temp2, dest, temp1);
++}
++
++void MacroAssembler::minFloat64x2(FloatRegister rhs, FloatRegister lhsDest) {
++  minFloat64x2(lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::minFloat64x2(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest) {
++  as_xvmindp(dest, lhs, rhs);
++}
++
++void MacroAssembler::minFloat64x2(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest, FloatRegister temp1,
++                                  FloatRegister temp2) {
++  // NaN mask and add must be computed BEFORE min (which may clobber lhs via
++  // dest).
++  as_xvcmpeqdp(temp1, lhs, lhs);
++  as_xvcmpeqdp(temp2, rhs, rhs);
++  as_xxland(temp1, temp1, temp2);  // temp1 = ~0 when both non-NaN
++  as_xvadddp(temp2, lhs, rhs);     // temp2 = add (NaN source)
++  as_xvmindp(dest, lhs, rhs);      // dest = min (may clobber lhs)
++  as_xxsel(dest, temp2, dest, temp1);
++}
++
++void MacroAssembler::maxFloat32x4(FloatRegister rhs, FloatRegister lhsDest) {
++  maxFloat32x4(lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest) {
++  as_xvmaxsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest, FloatRegister temp1,
++                                  FloatRegister temp2) {
++  // Wasm max with NaN propagation, using temp registers.
++  as_xvcmpeqsp(temp1, lhs, lhs);
++  as_xvcmpeqsp(temp2, rhs, rhs);
++  as_xxland(temp1, temp1, temp2);
++  as_xvaddsp(temp2, lhs, rhs);
++  as_xvmaxsp(dest, lhs, rhs);
++  as_xxsel(dest, temp2, dest, temp1);
++}
++
++void MacroAssembler::maxFloat64x2(FloatRegister rhs, FloatRegister lhsDest) {
++  maxFloat64x2(lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest) {
++  as_xvmaxdp(dest, lhs, rhs);
++}
++
++void MacroAssembler::maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest, FloatRegister temp1,
++                                  FloatRegister temp2) {
++  as_xvcmpeqdp(temp1, lhs, lhs);
++  as_xvcmpeqdp(temp2, rhs, rhs);
++  as_xxland(temp1, temp1, temp2);
++  as_xvadddp(temp2, lhs, rhs);
++  as_xvmaxdp(dest, lhs, rhs);
++  as_xxsel(dest, temp2, dest, temp1);
++}
++
++void MacroAssembler::unsignedTruncSatFloat32x4ToInt32x4(FloatRegister src,
++                                                        FloatRegister dest) {
++  as_xvcvspuxws(dest, src);
++}
++
++void MacroAssembler::extractLaneInt64x2(uint32_t lane, FloatRegister src,
++                                        Register64 dest) {
++  MOZ_ASSERT(lane < 2);
++  if (lane == 1) {
++    // Lane 1 = BE dword 0 = register bits[0:63].
++    as_mfvsrd(dest.reg, src);
++  } else {
++    // Lane 0 = BE dword 1.
++    if (HasPOWER9()) {
++      as_mfvsrld(dest.reg, src);
++    } else {
++      ScratchSimd128Scope scratch(*this);
++      as_xxpermdi(scratch, src, src, 2);
++      as_mfvsrd(dest.reg, scratch);
++    }
++  }
++}
++
++void MacroAssembler::replaceLaneInt64x2(unsigned lane, Register64 rhs,
++                                        FloatRegister lhsDest) {
++  MOZ_ASSERT(lane < 2);
++  if (HasPOWER10()) {
++    // 1 insn, no scratch VSR. UIM byte offset: lane 0 → 8, lane 1 → 0.
++    as_vinsd(lhsDest, rhs.reg, (1 - lane) * 8);
++    return;
++  }
++  ScratchSimd128Scope scratch(*this);
++  as_mtvsrd(scratch, rhs.reg);
++  if (lane == 0) {
++    // Replace dw1 (LE low = lane 0). Keep dw0 (lane 1).
++    // dm=0b00: [lhsDest.dw0, scratch.dw0]
++    as_xxpermdi(lhsDest, lhsDest, scratch, 0);
++  } else {
++    // Replace dw0 (LE high = lane 1). Keep dw1 (lane 0).
++    // dm=0b01: [scratch.dw0, lhsDest.dw1]
++    as_xxpermdi(lhsDest, scratch, lhsDest, 1);
++  }
++}
++
++// SIMD 3-operand arithmetic (x86_shared-style signatures).
++
++void MacroAssembler::addFloat32x4(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest) {
++  as_xvaddsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::addFloat64x2(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest) {
++  as_xvadddp(dest, lhs, rhs);
++}
++
++void MacroAssembler::addInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vadduhm), lhs, rhs, dest);
++}
++
++void MacroAssembler::addInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddubm), lhs, rhs, dest);
++}
++
++void MacroAssembler::divFloat32x4(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest) {
++  as_xvdivsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::extractLaneInt16x8(uint32_t lane, FloatRegister src,
++                                        Register dest) {
++  MOZ_ASSERT(lane < 8);
++  if (HasPOWER9()) {
++    as_vextractuh(ScratchSimd128Reg, src, 14 - 2 * lane);
++    as_mfvsrd(dest, ScratchSimd128Reg);
++    as_extsh(dest, dest);
++    return;
++  }
++  ExtractLaneToGPR(*this, lane, src, dest, 2, 16);
++  as_extsh(dest, dest);
++}
++
++void MacroAssembler::extractLaneInt32x4(uint32_t lane, FloatRegister src,
++                                        Register dest) {
++  MOZ_ASSERT(lane < 4);
++  ExtractLaneToGPR(*this, lane, src, dest, 4, 32);
++  // ExtractLaneToGPR leaves the adjacent lane in the high 32 bits for the
++  // unshifted lanes (0 and 2); canonicalize to a sign-extended i32, as the
++  // i8x16/i16x8 extracts do with extsb/extsh. A consumer that reads the full
++  // 64-bit register -- e.g. the POWER8 i32.ctz emulation, whose 64-bit neg/and.
++  // with a 32-bit cntlzw otherwise mis-handles a zero low word over nonzero
++  // high garbage and returns -1 -- requires this.
++  as_extsw(dest, dest);
++}
++
++void MacroAssembler::extractLaneInt8x16(uint32_t lane, FloatRegister src,
++                                        Register dest) {
++  MOZ_ASSERT(lane < 16);
++  if (HasPOWER9()) {
++    as_vextractub(ScratchSimd128Reg, src, 15 - lane);
++    as_mfvsrd(dest, ScratchSimd128Reg);
++    as_extsb(dest, dest);
++    return;
++  }
++  ExtractLaneToGPR(*this, lane, src, dest, 1, 8);
++  as_extsb(dest, dest);
++}
++
++void MacroAssembler::maxInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxsh), lhs, rhs, dest);
++}
++
++void MacroAssembler::maxInt32x4(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxsw), lhs, rhs, dest);
++}
++
++void MacroAssembler::maxInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxsb), lhs, rhs, dest);
++}
++
++void MacroAssembler::minInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminsb), lhs, rhs, dest);
++}
++
++void MacroAssembler::mulInt32x4(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmuluwm), lhs, rhs, dest);
++}
++
++void MacroAssembler::narrowInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                   FloatRegister dest) {
++  // On LE, VMX pack swaps operand order vs Wasm convention.
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vpkshss), rhs, lhs, dest);
++}
++
++void MacroAssembler::splatX2(Register64 src, FloatRegister dest) {
++  if (HasPOWER9()) {
++    as_mtvsrdd(dest, src.reg, src.reg);
++  } else {
++    as_mtvsrd(dest, src.reg);
++    as_xxpermdi(dest, dest, dest, 0);
++  }
++}
++
++void MacroAssembler::subInt32x4(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubuwm), lhs, rhs, dest);
++}
++
++void MacroAssembler::swizzleInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                    FloatRegister dest) {
++  // Wasm i8x16.swizzle: result[i] = (rhs[i] < 16) ? lhs[rhs[i]] : 0.
++  //
++  // Strategy: build ctrl in ScratchSimd128 (which can't alias inputs
++  // because v0 is non-allocatable). Use vsububs(splat(15), rhs) to
++  // produce ctrl = max(0, 15 - rhs); the saturation clamps out-of-range
++  // indices to 0, and those positions get masked off below.
++  //
++  // The mask is computed via vcmpgtub(rhs, splat(15)) + xxlnor — 0xFF
++  // where rhs <= 15. Reformulating "rhs < 16" as "!(rhs > 15)" lets us
++  // use vspltisb with a 5-bit signed immediate (P7+, 1 insn, no GPR
++  // scratch) for both splat-of-15 sites, replacing the previous
++  // movePtr(0x0F0F0F0F)/movePtr(0x10101010) + splatX4 dance.
++  //
++  // Aliasing: dest may equal lhs (wasm baseline calls swizzleInt8x16(
++  // rsd, rs, rsd); Ion's useRegisterAtStart permits the same). When
++  // dest != rhs, ctrl can be built in scratch and the mask computed
++  // after the permute (rhs is still alive). When dest == rhs, the
++  // permute would clobber rhs before we could compute the mask, so the
++  // mask goes to the red zone first.
++  ScratchSimd128Scope scratch(*this);
++  uint8_t s = scratch.encoding() & 31;
++  uint8_t l = lhs.encoding() & 31;
++  uint8_t r = rhs.encoding() & 31;
++  uint8_t d = dest.encoding() & 31;
++
++  if (dest != rhs) {
++    as_vspltisb(s, 15);
++    as_vsububs(s, s, r);   // scratch = ctrl
++    as_vperm(d, l, l, s);  // dest = vperm(lhs, lhs, ctrl)
++    as_vspltisb(s, 15);
++    as_vcmpgtub(s, r, s);             // scratch = 0xFF where rhs > 15
++    as_xxlandc(dest, dest, scratch);  // dest &= ~scratch (= bytes-to-keep)
++    return;
++  }
++
++  // dest == rhs: vperm clobbers rhs, so build the bytes-to-zero mask first
++  // and stash it. The xxlandc at the end consumes the un-inverted form.
++  as_vspltisb(s, 15);
++  as_vcmpgtub(s, r, s);  // scratch = 0xFF where rhs > 15
++  RedZoneStashSimd128(*this, scratch, 0);
++  as_vspltisb(s, 15);
++  as_vsububs(s, s, r);   // scratch = ctrl
++  as_vperm(d, l, l, s);  // dest = vperm(lhs, lhs, ctrl)
++  RedZoneRestoreSimd128(*this, 0, scratch);
++  as_xxlandc(dest, dest, scratch);  // dest &= ~scratch (= bytes-to-keep)
++}
++// SIMD 3-operand arithmetic (continued).
++
++void MacroAssembler::addInt32x4(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vadduwm), lhs, rhs, dest);
++}
++
++void MacroAssembler::addInt64x2(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddudm), lhs, rhs, dest);
++}
++
++void MacroAssembler::addSatInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                   FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddshs), lhs, rhs, dest);
++}
++
++void MacroAssembler::addSatInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                   FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddsbs), lhs, rhs, dest);
++}
++
++void MacroAssembler::divFloat64x2(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest) {
++  as_xvdivdp(dest, lhs, rhs);
++}
++
++void MacroAssembler::minInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminsh), lhs, rhs, dest);
++}
++
++void MacroAssembler::minInt32x4(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminsw), lhs, rhs, dest);
++}
++
++void MacroAssembler::mulFloat32x4(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest) {
++  as_xvmulsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::mulFloat64x2(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest) {
++  as_xvmuldp(dest, lhs, rhs);
++}
++
++void MacroAssembler::mulInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  ZeroSimd128(*this, scratch);
++  EmitVmxTernary(
++      *this,
++      [](Assembler& a, uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc) {
++        a.as_vmladduhm(vrt, vra, vrb, vrc);
++      },
++      lhs, rhs, scratch, dest);
++}
++
++void MacroAssembler::narrowInt32x4(FloatRegister lhs, FloatRegister rhs,
++                                   FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vpkswss), rhs, lhs, dest);
++}
++
++void MacroAssembler::subFloat32x4(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest) {
++  as_xvsubsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::subFloat64x2(FloatRegister lhs, FloatRegister rhs,
++                                  FloatRegister dest) {
++  as_xvsubdp(dest, lhs, rhs);
++}
++
++void MacroAssembler::subInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubuhm), lhs, rhs, dest);
++}
++
++void MacroAssembler::subInt64x2(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubudm), lhs, rhs, dest);
++}
++
++void MacroAssembler::subInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsububm), lhs, rhs, dest);
++}
++
++void MacroAssembler::subSatInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                   FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubshs), lhs, rhs, dest);
++}
++
++void MacroAssembler::subSatInt8x16(FloatRegister lhs, FloatRegister rhs,
++                                   FloatRegister dest) {
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubsbs), lhs, rhs, dest);
++}
++
++void MacroAssembler::widenDotInt16x8(FloatRegister lhs, FloatRegister rhs,
++                                     FloatRegister dest) {
++  // i32x4.dot_i16x8_s: result[k] = lhs[2k]*rhs[2k] + lhs[2k+1]*rhs[2k+1].
++  // vmsumshm computes exactly that for each i32 lane plus an addend (VRC).
++  // With VRC = 0, the addend disappears and we get the wasm spec result in
++  // a single instruction. xxlxor zeros the scratch in 1 insn, so total is
++  // 2 insns vs the old vmulesh/vmulosh/vadduwm trio.
++  ScratchSimd128Scope scratch(*this);
++  as_xxlxor(scratch, scratch, scratch);
++  as_vmsumshm(dest.encoding() & 31, lhs.encoding() & 31, rhs.encoding() & 31,
++              scratch.encoding() & 31);
++}
++
++// SIMD variable-shift and FMA helpers.
++// Pattern: splat the GPR shift count across all lanes of a scratch VSR,
++// then issue a vector-shift on lhs and the splat. vsl{b,h} / vsr{b,h} /
++// vsra{b,h} use the low 3-or-4 bits of each lane's shift count, exactly
++// matching wasm modulo-N shift semantics.
++
++void MacroAssembler::leftShiftInt8x16(FloatRegister lhs, Register rhs,
++                                      FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  splatX16(rhs, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslb), lhs, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt8x16(FloatRegister lhs, Register rhs,
++                                       FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  splatX16(rhs, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrab), lhs, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt8x16(FloatRegister lhs, Register rhs,
++                                               FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  splatX16(rhs, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrb), lhs, scratch, dest);
++}
++
++void MacroAssembler::leftShiftInt16x8(FloatRegister lhs, Register rhs,
++                                      FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  splatX8(rhs, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslh), lhs, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt16x8(FloatRegister lhs, Register rhs,
++                                       FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  splatX8(rhs, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrah), lhs, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt16x8(FloatRegister lhs, Register rhs,
++                                               FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  splatX8(rhs, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrh), lhs, scratch, dest);
++}
++
++void MacroAssembler::leftShiftInt32x4(FloatRegister lhs, Register rhs,
++                                      FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  splatX4(rhs, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslw), lhs, scratch, dest);
++}
++
++void MacroAssembler::leftShiftInt64x2(FloatRegister lhs, Register rhs,
++                                      FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  splatX4(rhs, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsld), lhs, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt32x4(FloatRegister lhs, Register rhs,
++                                       FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  splatX4(rhs, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsraw), lhs, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt64x2(FloatRegister lhs, Register rhs,
++                                       FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  splatX4(rhs, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrad), lhs, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt32x4(FloatRegister lhs, Register rhs,
++                                               FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  splatX4(rhs, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrw), lhs, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt64x2(FloatRegister lhs, Register rhs,
++                                               FloatRegister dest) {
++  ScratchSimd128Scope scratch(*this);
++  splatX4(rhs, scratch);
++  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrd), lhs, scratch, dest);
++}
++
++void MacroAssembler::fmaFloat32x4(FloatRegister src1, FloatRegister src2,
++                                  FloatRegister srcDest) {
++  as_xvmaddasp(srcDest, src1, src2);
++}
++
++void MacroAssembler::fmaFloat64x2(FloatRegister src1, FloatRegister src2,
++                                  FloatRegister srcDest) {
++  as_xvmaddadp(srcDest, src1, src2);
++}
++
++//}}} check_macroassembler_style
++
++}  // namespace jit
++}  // namespace js
++
++#endif /* jit_ppc64_MacroAssembler_ppc64_inl_h */
+diff --git a/js/src/jit/ppc64/MacroAssembler-ppc64.cpp b/js/src/jit/ppc64/MacroAssembler-ppc64.cpp
+new file mode 100644
+index 000000000000..702fb3cd4cba
+--- /dev/null
++++ b/js/src/jit/ppc64/MacroAssembler-ppc64.cpp
+@@ -0,0 +1,3467 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/ppc64/MacroAssembler-ppc64.h"
++
++#include "jit/Bailouts.h"
++#include "jit/BaselineFrame.h"
++#include "jit/FlushICache.h"
++#include "jit/JitFrames.h"
++#include "jit/JitRuntime.h"
++#include "jit/MacroAssembler.h"
++#include "jit/MoveEmitter.h"
++#include "jit/ppc64/SharedICRegisters-ppc64.h"
++#include "vm/JitActivation.h"
++#include "vm/JSContext.h"
++#include "wasm/WasmStubs.h"
++
++#include "jit/MacroAssembler-inl.h"
++
++namespace js {
++namespace jit {
++
++MacroAssembler& MacroAssemblerPPC64::asMasm() {
++  return *static_cast<MacroAssembler*>(this);
++}
++
++const MacroAssembler& MacroAssemblerPPC64::asMasm() const {
++  return *static_cast<const MacroAssembler*>(this);
++}
++
++// ===============================================================
++// Out-of-line fake exit frame
++
++bool MacroAssemblerPPC64Compat::buildOOLFakeExitFrame(void* fakeReturnAddr) {
++  asMasm().Push(FrameDescriptor(FrameType::IonJS));
++  asMasm().Push(ImmPtr(fakeReturnAddr));
++  asMasm().Push(FramePointer);
++  return true;
++}
++
++// ===============================================================
++// Load int32 or double from memory
++
++void MacroAssemblerPPC64Compat::loadInt32OrDouble(const Address& src,
++                                                  FloatRegister dest) {
++  UseScratchRegisterScope temps(*this);
++  Register scratch = temps.Acquire();
++  Label end;
++
++  // Load the boxed value and stash in the FPR immediately, then reuse the
++  // GPR for the tag test.  Only one scratch GPR is held here so that
++  // branchTestInt32 can acquire the second one for the ImmTag constant.
++  loadPtr(Address(src.base, src.offset), scratch);
++  as_mtvsrd(dest, scratch);
++  x_srdi(scratch, scratch, JSVAL_TAG_SHIFT);
++  asMasm().branchTestInt32(Assembler::NotEqual, scratch, &end);
++  // It was an int32.  Recover the boxed value from the FPR, sign-extend
++  // the low 32 bits, and convert to double.
++  as_mfvsrd(scratch, dest);
++  as_extsw(scratch, scratch);
++  as_mtvsrd(dest, scratch);
++  as_fcfid(dest, dest);
++
++  bind(&end);
++}
++
++void MacroAssemblerPPC64Compat::loadInt32OrDouble(const BaseIndex& addr,
++                                                  FloatRegister dest) {
++  UseScratchRegisterScope temps(*this);
++  Register scratch = temps.Acquire();
++  Label end;
++
++  computeScaledAddress(addr, scratch);
++  loadPtr(Address(scratch, addr.offset), scratch);
++  as_mtvsrd(dest, scratch);
++  x_srdi(scratch, scratch, JSVAL_TAG_SHIFT);
++  asMasm().branchTestInt32(Assembler::NotEqual, scratch, &end);
++  as_mfvsrd(scratch, dest);
++  as_extsw(scratch, scratch);
++  as_mtvsrd(dest, scratch);
++  as_fcfid(dest, dest);
++
++  bind(&end);
++}
++
++// ===============================================================
++// Conversion functions
++
++void MacroAssemblerPPC64Compat::convertUInt32ToDouble(Register src,
++                                                      FloatRegister dest) {
++  // mtvsrwz: VSR[dest].dw0 = zero_ext_64(src[32:63]); P8+ (ISA 2.07).
++  // Replaces rldicl + mtvsrd (2 insns + scratch) with 1 insn.
++  as_mtvsrwz(dest, src);
++  as_fcfid(dest, dest);
++}
++
++void MacroAssemblerPPC64Compat::convertUInt32ToFloat32(Register src,
++                                                       FloatRegister dest) {
++  // mtvsrwz + fcfids; same recipe as convertUInt32ToDouble.
++  as_mtvsrwz(dest, src);
++  as_fcfids(dest, dest);
++}
++
++// Helper for the negative-zero check after a successful round-trip.
++// Precondition: `dest` holds the integer round-trip result; if it equals
++// zero, then `src` was either +0.0 or -0.0 (those are the only doubles
++// that round-trip to int 0). Distinguish them by inspecting src's sign
++// bit: -0.0 has its MSB set, so an mfvsrd-then-signed-cmp-against-zero
++// branches to `fail` only for -0.0. Non-zero `dest` values (including
++// every negative integer) skip the check entirely.
++static void EmitNegativeZeroCheck(MacroAssemblerPPC64Compat& masm,
++                                  FloatRegister src, Register dest,
++                                  Label* fail) {
++  Label notZero;
++  masm.as_cmpdi(dest, 0);
++  masm.ma_b(Assembler::NotEqual, &notZero);
++  UseScratchRegisterScope temps(masm);
++  Register scratch = temps.Acquire();
++  masm.as_mfvsrd(scratch, src);
++  masm.as_cmpdi(scratch, 0);
++  masm.ma_b(Assembler::LessThan, fail);
++  masm.bind(&notZero);
++}
++
++void MacroAssemblerPPC64Compat::convertDoubleToInt32(FloatRegister src,
++                                                     Register dest, Label* fail,
++                                                     bool negativeZeroCheck) {
++  // Truncate to int32 (round toward zero), sign-extend, and verify
++  // exactness via round-trip compare. fctiwz writes the int32 to BE
++  // bits 32..63 of the FPR; mfvsrd extracts and extsw sign-extends.
++  // The compare also catches NaN (unordered) and Inf (saturated to
++  // INT32_{MIN,MAX}, won't round-trip equal).
++  as_fctiwz(ScratchDoubleReg, src);
++  as_mfvsrd(dest, ScratchDoubleReg);
++  as_extsw(dest, dest);
++  as_mtvsrd(ScratchDoubleReg, dest);
++  as_fcfid(ScratchDoubleReg, ScratchDoubleReg);
++  as_fcmpu(ScratchDoubleReg, src);
++  ma_b(Assembler::DoubleNotEqualOrUnordered, fail);
++
++  if (negativeZeroCheck) {
++    EmitNegativeZeroCheck(*this, src, dest, fail);
++  }
++}
++
++void MacroAssemblerPPC64Compat::convertDoubleToPtr(FloatRegister src,
++                                                   Register dest, Label* fail,
++                                                   bool negativeZeroCheck) {
++  // Same pattern as convertDoubleToInt32 but to int64 (no sign-extend
++  // needed since fctidz already produces a 64-bit result).
++  as_fctidz(ScratchDoubleReg, src);
++  as_mfvsrd(dest, ScratchDoubleReg);
++  as_mtvsrd(ScratchDoubleReg, dest);
++  as_fcfid(ScratchDoubleReg, ScratchDoubleReg);
++  as_fcmpu(ScratchDoubleReg, src);
++  ma_b(Assembler::DoubleNotEqualOrUnordered, fail);
++
++  if (negativeZeroCheck) {
++    EmitNegativeZeroCheck(*this, src, dest, fail);
++  }
++}
++
++void MacroAssemblerPPC64Compat::convertFloat32ToInt32(FloatRegister src,
++                                                      Register dest,
++                                                      Label* fail,
++                                                      bool negativeZeroCheck) {
++  // Same as convertDoubleToInt32 but the round-trip uses fcfids so the
++  // comparison happens at single precision (matches src's actual width).
++  as_fctiwz(ScratchDoubleReg, src);
++  as_mfvsrd(dest, ScratchDoubleReg);
++  as_extsw(dest, dest);
++  as_mtvsrd(ScratchDoubleReg, dest);
++  as_fcfids(ScratchDoubleReg, ScratchDoubleReg);
++  as_fcmpu(ScratchDoubleReg, src);
++  ma_b(Assembler::DoubleNotEqualOrUnordered, fail);
++
++  if (negativeZeroCheck) {
++    EmitNegativeZeroCheck(*this, src, dest, fail);
++  }
++}
++
++CodeOffset MacroAssemblerPPC64Compat::toggledCall(JitCode* target,
++                                                  bool enabled) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  // stanza(8) + mtctr/bctrl(2) = 10 instructions.
++  m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++  BufferOffset boLoad =
++      emitLoad64Stanza(scratch, (uint64_t)uintptr_t(target->raw()));
++  CodeOffset offset(boLoad.getOffset());
++  addPendingJump(boLoad, ImmPtr(target->raw()), RelocationKind::JITCODE);
++  if (enabled) {
++    xs_mtctr(scratch);
++    as_bctr(LinkBit::LinkB);
++  } else {
++    writeInst(NopInst);
++    writeInst(NopInst);
++  }
++  m_buffer.leaveNoPool();
++  MOZ_ASSERT_IF(!oom(), nextOffset().getOffset() - offset.offset() ==
++                            ToggledCallSize(nullptr));
++  return offset;
++}
++
++// ===============================================================
++// Exception handling
++
++void MacroAssemblerPPC64Compat::handleFailureWithHandlerTail(
++    Label* profilerExitTail, Label* bailoutTail,
++    uint32_t* returnValueCheckOffset) {
++  // Round sizeof(ResumeFromException) up to ABIStackAlignment. The
++  // canonical (sz + align - 1) & ~(align - 1) form is exact: when sz
++  // is already a multiple of `align` the rounding is a no-op. The
++  // previous (sz + align) & ~(align - 1) over-allocated by `align`
++  // bytes whenever sz was already aligned.
++  int size = (sizeof(ResumeFromException) + ABIStackAlignment - 1) &
++             ~(ABIStackAlignment - 1);
++  asMasm().subPtr(Imm32(size), StackPointer);
++  // Use r3 (first argument register).
++  mov(StackPointer, r3);
++
++  using Fn = void (*)(ResumeFromException* rfe);
++  asMasm().setupUnalignedABICall(r4);
++  asMasm().passABIArg(r3);
++  asMasm().callWithABI<Fn, HandleException>(
++      ABIType::General, CheckUnsafeCallWithABI::DontCheckHasExitFrame);
++
++  *returnValueCheckOffset = asMasm().currentOffset();
++
++  Label entryFrame;
++  Label catch_;
++  Label finally;
++  Label returnBaseline;
++  Label returnIon;
++  Label bailout;
++  Label wasmInterpEntry;
++  Label wasmCatch;
++
++  load32(Address(StackPointer, ResumeFromException::offsetOfKind()), r3);
++  asMasm().branch32(Assembler::Equal, r3,
++                    Imm32(ExceptionResumeKind::EntryFrame), &entryFrame);
++  asMasm().branch32(Assembler::Equal, r3, Imm32(ExceptionResumeKind::Catch),
++                    &catch_);
++  asMasm().branch32(Assembler::Equal, r3, Imm32(ExceptionResumeKind::Finally),
++                    &finally);
++  asMasm().branch32(Assembler::Equal, r3,
++                    Imm32(ExceptionResumeKind::ForcedReturnBaseline),
++                    &returnBaseline);
++  asMasm().branch32(Assembler::Equal, r3,
++                    Imm32(ExceptionResumeKind::ForcedReturnIon), &returnIon);
++  asMasm().branch32(Assembler::Equal, r3, Imm32(ExceptionResumeKind::Bailout),
++                    &bailout);
++  asMasm().branch32(Assembler::Equal, r3,
++                    Imm32(ExceptionResumeKind::WasmInterpEntry),
++                    &wasmInterpEntry);
++  asMasm().branch32(Assembler::Equal, r3, Imm32(ExceptionResumeKind::WasmCatch),
++                    &wasmCatch);
++
++  breakpoint();  // Invalid kind.
++
++  // No exception handler. Return error from entry frame.
++  bind(&entryFrame);
++  asMasm().moveValue(MagicValue(JS_ION_ERROR), JSReturnOperand);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
++          FramePointer);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
++          StackPointer);
++  ret();
++
++  // Catch handler.
++  bind(&catch_);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfTarget()), r3);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
++          FramePointer);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
++          StackPointer);
++  jump(r3);
++
++  // Finally block.
++  bind(&finally);
++  ValueOperand exception = ValueOperand(r4);
++  loadValue(Address(StackPointer, ResumeFromException::offsetOfException()),
++            exception);
++
++  ValueOperand exceptionStack = ValueOperand(r5);
++  loadValue(
++      Address(StackPointer, ResumeFromException::offsetOfExceptionStack()),
++      exceptionStack);
++
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfTarget()), r3);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
++          FramePointer);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
++          StackPointer);
++
++  pushValue(exception);
++  pushValue(exceptionStack);
++  pushValue(BooleanValue(true));
++  jump(r3);
++
++  // Forced return from baseline.
++  Label profilingInstrumentation;
++  bind(&returnBaseline);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
++          FramePointer);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
++          StackPointer);
++  loadValue(Address(FramePointer, BaselineFrame::reverseOffsetOfReturnValue()),
++            JSReturnOperand);
++  jump(&profilingInstrumentation);
++
++  // Forced return from Ion.
++  bind(&returnIon);
++  loadValue(Address(StackPointer, ResumeFromException::offsetOfException()),
++            JSReturnOperand);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
++          FramePointer);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
++          StackPointer);
++
++  bind(&profilingInstrumentation);
++  {
++    Label skipProfilingInstrumentation;
++    AbsoluteAddress addressOfEnabled(
++        asMasm().runtime()->geckoProfiler().addressOfEnabled());
++    asMasm().branch32(Assembler::Equal, addressOfEnabled, Imm32(0),
++                      &skipProfilingInstrumentation);
++    jump(profilerExitTail);
++    bind(&skipProfilingInstrumentation);
++  }
++
++  xs_mr(StackPointer, FramePointer);
++  // Pop FP from stack, then return (pop LR + blr).
++  loadPtr(Address(StackPointer, 0), FramePointer);
++  asMasm().addPtr(Imm32(sizeof(void*)), StackPointer);
++  ret();
++
++  // Bailout.
++  bind(&bailout);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfBailoutInfo()),
++          r5);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
++          StackPointer);
++  xs_li(ReturnReg, 1);
++  jump(bailoutTail);
++
++  // Wasm interp entry.
++  bind(&wasmInterpEntry);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
++          FramePointer);
++  loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
++          StackPointer);
++  movePtr(ImmWord(wasm::InterpFailInstanceReg), InstanceReg);
++  ret();
++
++  // Wasm catch.
++  bind(&wasmCatch);
++  wasm::GenerateJumpToCatchHandler(asMasm(), StackPointer, r4, r5, r6);
++}
++
++void MacroAssembler::clampDoubleToUint8(FloatRegister input, Register output) {
++  ScratchDoubleScope fpscratch(asMasm());
++
++  if (HasPOWER9()) {
++    // P9 xsmaxjdp uses Java/JS semantics (ISA v3.0B): any NaN
++    // is treated as "less than any number that is not a NaN", so
++    // xsmaxjdp(input, 0) collapses {NaN, -Inf, ≤ 0} to 0 in one insn —
++    // the "≤ 0 or NaN → 0" branch dance disappears.
++    //
++    // After the max, fctid (round-to-nearest-even per FPSCR default,
++    // matches ECMA Uint8ClampedArray's round-half-to-even) saturates
++    // out-of-int64 values to INT64_MAX. Remaining upper clamp
++    // (output > 255 → 255) is one cmpdi + isel.
++    zeroDouble(fpscratch);
++    as_xsmaxjdp(fpscratch, input, fpscratch);
++    as_fctid(fpscratch, fpscratch);
++    as_mfvsrd(output, fpscratch);
++    UseScratchRegisterScope temps(asMasm());
++    Register max255 = temps.Acquire();
++    xs_li(max255, 255);
++    as_cmpdi(output, 255);
++    as_isel(output, max255, output, GreaterThan);
++    return;
++  }
++
++  // POWER8 fallback: xsmaxjdp is unavailable, so filter NaN explicitly
++  // before fctid. Per Power ISA, fctid maps NaN to INT64_MAX, which
++  // would clamp to 255 instead of the spec-required 0.
++  Label positive, below255, done;
++  zeroDouble(fpscratch);
++  branchDouble(DoubleGreaterThan, input, fpscratch, &positive);
++  {
++    move32(Imm32(0), output);
++    jump(&done);
++  }
++
++  bind(&positive);
++
++  loadConstantDouble(255.0, fpscratch);
++  branchDouble(DoubleLessThan, input, fpscratch, &below255);
++  {
++    move32(Imm32(255), output);
++    jump(&done);
++  }
++
++  bind(&below255);
++
++  as_fctid(fpscratch, input);
++  as_mfvsrd(output, fpscratch);
++  bind(&done);
++}
++
++void MacroAssembler::subFromStackPtr(Imm32 imm32) {
++  if (imm32.value) {
++    asMasm().subPtr(imm32, StackPointer);
++  }
++}
++
++//{{{ check_macroassembler_style
++
++void MacroAssembler::widenInt32(Register r) {
++  move32To64SignExtend(r, Register64(r));
++}
++
++// Stack operations.
++void MacroAssembler::Push(Register reg) {
++  push(reg);
++  adjustFrame(int32_t(sizeof(intptr_t)));
++}
++void MacroAssembler::Push(const Imm32 imm) {
++  push(imm);
++  adjustFrame(int32_t(sizeof(intptr_t)));
++}
++
++void MacroAssembler::Push(const ImmWord imm) {
++  push(imm);
++  adjustFrame(int32_t(sizeof(intptr_t)));
++}
++
++void MacroAssembler::Push(const ImmPtr imm) {
++  Push(ImmWord(uintptr_t(imm.value)));
++}
++
++void MacroAssembler::Push(const ImmGCPtr ptr) {
++  push(ptr);
++  adjustFrame(int32_t(sizeof(intptr_t)));
++}
++
++void MacroAssembler::PushBoxed(FloatRegister reg) {
++  subFromStackPtr(Imm32(sizeof(double)));
++  boxDouble(reg, Address(getStackPointer(), 0));
++  adjustFrame(sizeof(double));
++}
++
++void MacroAssembler::Pop(Register reg) {
++  pop(reg);
++  adjustFrame(-int32_t(sizeof(intptr_t)));
++}
++void MacroAssembler::PushRegsInMask(LiveRegisterSet set) {
++  int32_t diff =
++      set.gprs().size() * sizeof(intptr_t) + set.fpus().getPushSizeInBytes();
++  const int32_t reserved = diff;
++
++  reserveStack(reserved);
++  for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); ++iter) {
++    diff -= sizeof(intptr_t);
++    storePtr(*iter, Address(StackPointer, diff));
++  }
++
++  // Natural per-kind slot — 8 bytes for Single/Double via stfd, 16 bytes
++  // for Simd128 via stxvx. RegisterDump::FPUArray is sized 32 × 8 = 256
++  // bytes (sizeof(RegisterContent) is 8 — no v128 in the union), so
++  // f_K's stfd slot lands at the right offset. Bailout AllRegs excludes
++  // Simd128 (Ion has no SIMD live), so the FP region in bailout frames
++  // is strictly Float-only.
++  for (FloatRegisterBackwardIterator iter(set.fpus().reduceSetForPush());
++       iter.more(); ++iter) {
++    FloatRegister reg = *iter;
++    diff -= reg.size();
++    if (reg.isSimd128()) {
++      storeUnalignedSimd128(reg, Address(StackPointer, diff));
++    } else {
++      storeDouble(reg.asDouble(), Address(StackPointer, diff));
++    }
++  }
++  MOZ_ASSERT(diff == 0);
++}
++void MacroAssembler::PopRegsInMaskIgnore(LiveRegisterSet set,
++                                         LiveRegisterSet ignore) {
++  int32_t diff =
++      set.gprs().size() * sizeof(intptr_t) + set.fpus().getPushSizeInBytes();
++  const int32_t reserved = diff;
++
++  for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); ++iter) {
++    diff -= sizeof(intptr_t);
++    if (!ignore.has(*iter)) {
++      loadPtr(Address(StackPointer, diff), *iter);
++    }
++  }
++
++  // Natural per-kind slot. See PushRegsInMask comment.
++  for (FloatRegisterBackwardIterator iter(set.fpus().reduceSetForPush());
++       iter.more(); ++iter) {
++    FloatRegister reg = *iter;
++    diff -= reg.size();
++    if (!ignore.has(reg)) {
++      if (reg.isSimd128()) {
++        loadUnalignedSimd128(Address(StackPointer, diff), reg);
++      } else {
++        loadDouble(Address(StackPointer, diff), reg.asDouble());
++      }
++    }
++  }
++  MOZ_ASSERT(diff == 0);
++  freeStack(reserved);
++}
++
++// Call operations.
++CodeOffset MacroAssembler::call(Register reg) {
++  // ELFv2 ABI: r12 must hold the target address at function entry
++  // so the callee can compute its TOC pointer from r12.
++  if (reg != CallReg) {
++    movePtr(reg, CallReg);
++  }
++  xs_mtctr(CallReg);
++  as_bctr(LinkB);
++  return CodeOffset(currentOffset());
++}
++CodeOffset MacroAssembler::call(Label* label) {
++  if (label->bound()) {
++    // Open the no-pool window BEFORE computing the displacement.
++    // enterNoPool() can itself trigger a pending pool flush, advancing
++    // currentOffset(). A pre-flush displacement emitted at the post-flush
++    // position would overshoot the target by poolSize bytes.
++    m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++    int32_t offset = label->offset() - currentOffset();
++    // Call instruction goes at inst[9] in the 10-word stanza.
++    int32_t callOffset = offset - 9 * (int32_t)sizeof(uint32_t);
++    if (JOffImm26::IsInRange(callOffset)) {
++      // Short: 9 nops + bl = 10 instructions.
++      writeInst(NopInst);
++      writeInst(NopInst);
++      writeInst(NopInst);
++      writeInst(NopInst);
++      writeInst(NopInst);
++      writeInst(NopInst);
++      writeInst(NopInst);
++      writeInst(NopInst);
++      writeInst(NopInst);
++      as_b(JOffImm26(callOffset), RelativeBranch, LinkB);
++      m_buffer.leaveNoPool();
++      return CodeOffset(currentOffset());
++    }
++    // Long call to bound label: stanza(8) + mtctr + bctrl = 10 instructions.
++    BufferOffset bo =
++        emitLoad64Stanza(SecondScratchReg, LabelBase::INVALID_OFFSET);
++    xs_mtctr(SecondScratchReg);
++    as_bctr(LinkB);
++    m_buffer.leaveNoPool();
++    addLongJump(bo, BufferOffset(label->offset()));
++    return CodeOffset(currentOffset());
++  }
++  // Emit a CallTag stanza: trap + chain + 8 nops (10 instructions total).
++  m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++  BufferOffset bo = xs_trap_tagged(CallTag);
++  writeInst(label->used() ? label->offset() : LabelBase::INVALID_OFFSET);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  m_buffer.leaveNoPool();
++  if (!oom()) {
++    label->use(bo.getOffset());
++  }
++  return CodeOffset(currentOffset());
++}
++CodeOffset MacroAssembler::call(const Address& addr) {
++  loadPtr(addr, CallReg);
++  return call(CallReg);
++}
++
++void MacroAssembler::call(ImmPtr target) {
++  uint64_t addr = uintptr_t(target.value);
++  // stanza(8) + mtctr + bctrl = 10 instructions.
++  m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++  BufferOffset bo = emitLoad64Stanza(CallReg, addr);
++  addPendingJump(bo, target, RelocationKind::HARDCODED);
++  xs_mtctr(CallReg);
++  as_bctr(LinkB);
++  m_buffer.leaveNoPool();
++}
++
++CodeOffset MacroAssembler::call(wasm::SymbolicAddress target) {
++  movePtr(target, CallReg);
++  return call(CallReg);
++}
++
++void MacroAssembler::callWithABINoProfiler(const Address& fun, ABIType result) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(fun, scratch);
++
++  uint32_t stackAdjust;
++  callWithABIPre(&stackAdjust);
++  call(scratch);
++  callWithABIPost(stackAdjust, result);
++}
++
++void MacroAssembler::callWithABIPre(uint32_t* stackAdjust, bool callFromWasm) {
++  MOZ_ASSERT(inCall_);
++  uint32_t stackForCall = abiArgs_.stackBytesConsumedSoFar();
++
++  // Reserve place for LR save.
++  stackForCall += sizeof(intptr_t);
++
++  if (dynamicAlignment_) {
++    stackForCall += ComputeByteAlignment(stackForCall, ABIStackAlignment);
++  } else {
++    uint32_t alignmentAtPrologue = callFromWasm ? sizeof(wasm::Frame) : 0;
++    stackForCall += ComputeByteAlignment(
++        stackForCall + framePushed() + alignmentAtPrologue, ABIStackAlignment);
++  }
++
++  *stackAdjust = stackForCall;
++  reserveStack(stackForCall);
++
++  // Save LR. Restore it in callWithABIPost.
++  {
++    UseScratchRegisterScope temps(asMasm());
++    Register scratch = temps.Acquire();
++    xs_mflr(scratch);
++    storePtr(scratch, Address(StackPointer, stackForCall - sizeof(intptr_t)));
++  }
++
++  // Position all arguments.
++  {
++    enoughMemory_ &= moveResolver_.resolve();
++    if (!enoughMemory_) {
++      return;
++    }
++
++    MoveEmitter emitter(*this);
++    emitter.emit(moveResolver_);
++    emitter.finish();
++  }
++
++  assertStackAlignment(ABIStackAlignment);
++}
++
++void MacroAssembler::callWithABIPost(uint32_t stackAdjust, ABIType result) {
++  {
++    UseScratchRegisterScope temps(asMasm());
++    Register scratch = temps.Acquire();
++    loadPtr(Address(StackPointer, stackAdjust - sizeof(intptr_t)), scratch);
++    xs_mtlr(scratch);
++  }
++
++  if (dynamicAlignment_) {
++    // Restore SP from stack (as stored in setupUnalignedABICall).
++    loadPtr(Address(StackPointer, stackAdjust), StackPointer);
++    adjustFrame(-stackAdjust);
++  } else {
++    freeStack(stackAdjust);
++  }
++
++#ifdef DEBUG
++  MOZ_ASSERT(inCall_);
++  inCall_ = false;
++#endif
++}
++
++// Value operations.
++void MacroAssembler::moveValue(const ValueOperand& src,
++                               const ValueOperand& dest) {
++  if (src.valueReg() != dest.valueReg()) {
++    movePtr(src.valueReg(), dest.valueReg());
++  }
++}
++void MacroAssembler::moveValue(const Value& src, const ValueOperand& dest) {
++  if (!src.isGCThing()) {
++    movePtr(ImmWord(src.asRawBits()), dest.valueReg());
++    return;
++  }
++  CodeOffset off = movWithPatch(ImmWord(src.asRawBits()), dest.valueReg());
++  writeDataRelocation(off, src);
++}
++
++// Branch operations.
++void MacroAssembler::branchTestValue(Condition cond, const ValueOperand& lhs,
++                                     const Value& rhs, Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  MOZ_ASSERT(!rhs.isNaN());
++
++  if (!rhs.isGCThing()) {
++    UseScratchRegisterScope temps(asMasm());
++    Register scratch = temps.Acquire();
++    MOZ_ASSERT(lhs.valueReg() != scratch);
++    movePtr(ImmWord(rhs.asRawBits()), scratch);
++    branchPtr(cond, lhs.valueReg(), scratch, label);
++  } else {
++    UseScratchRegisterScope temps(asMasm());
++    Register scratch = temps.Acquire();
++    MOZ_ASSERT(lhs.valueReg() != scratch);
++    moveValue(rhs, ValueOperand(scratch));
++    branchPtr(cond, lhs.valueReg(), scratch, label);
++  }
++}
++void MacroAssembler::branchTestNaNValue(Condition cond, const ValueOperand& val,
++                                        Register temp, Label* label) {
++  MOZ_ASSERT(cond == Equal || cond == NotEqual);
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  MOZ_ASSERT(val.valueReg() != scratch);
++
++  // Strip the IEEE sign bit (LSB-numbering bit 63 = PPC-numbering bit 0)
++  // with rldicl SH=0, MB=1: rotate by zero (no-op) then keep bits 1..63 of
++  // PPC-numbering, clearing bit 0. Rotating by 1 instead would also shift
++  // the quiet-NaN bit out of position and cause 1.5 (0x3FF8...) and NaN
++  // (0x7FF8...) to collide after masking — bug 1943704 PPC64 regression.
++  as_rldicl(temp, val.valueReg(), 0, 1);
++
++  // Load canonical NaN (with sign bit 0) and strip its sign bit too.
++  static_assert(JS::detail::CanonicalizedNaNSignBit == 0);
++  moveValue(DoubleValue(JS::GenericNaN()), ValueOperand(scratch));
++  as_rldicl(scratch, scratch, 0, 1);
++
++  branchPtr(cond, temp, scratch, label);
++}
++
++void MacroAssembler::branchPtrInNurseryChunk(Condition cond, Register ptr,
++                                             Register temp, Label* label) {
++  MOZ_ASSERT(cond == Assembler::Equal || cond == Assembler::NotEqual);
++  MOZ_ASSERT(ptr != temp);
++  MOZ_ASSERT(temp != InvalidReg);
++
++  andPtr(Imm32(int32_t(~gc::ChunkMask)), ptr, temp);
++  branchPtr(InvertCondition(cond), Address(temp, gc::ChunkStoreBufferOffset),
++            ImmWord(0), label);
++}
++void MacroAssembler::branchValueIsNurseryCell(Condition cond,
++                                              ValueOperand value, Register temp,
++                                              Label* label) {
++  branchValueIsNurseryCellImpl(cond, value, temp, label);
++}
++
++// Patching / near address operations.
++CodeOffset MacroAssembler::nopPatchableToCall() {
++  // Emit 10 nops that can be patched to a call stanza:
++  // 8 load64 nops + mtctr nop + bctrl nop
++  // Return offset AFTER the stanza (= the return address).
++  m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  m_buffer.leaveNoPool();
++  return CodeOffset(currentOffset());
++}
++CodeOffset MacroAssembler::moveNearAddressWithPatch(Register dest) {
++  CodeOffset offset(currentOffset());
++  emitLoad64Stanza(dest, 0);
++  return offset;
++}
++// static
++void MacroAssembler::patchNearAddressMove(CodeLocationLabel loc,
++                                          CodeLocationLabel target) {
++  Instruction* inst = (Instruction*)loc.raw();
++  UpdateLoad64Value(inst, (uint64_t)target.raw());
++}
++
++// Return address operations (link register architectures).
++//
++// Note: these MUST decrement SP by exactly 8 bytes. wasm::Frame is 16 bytes
++// (callerFP_ + returnAddress_) and GenerateCallablePrologue pairs this with
++// push(FramePointer) to match that layout exactly — a 16-byte decrement here
++// would insert 8 bytes of padding and break FP-chain unwinding. The 8-byte
++// intermediate misalignment between this save and the following push(FP) is
++// never observed by a C call (no intervening transition), and any caller that
++// does make a C call after pushReturnAddress routes through
++// setupUnalignedABICall which re-aligns.
++void MacroAssembler::pushReturnAddress() {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  xs_mflr(scratch);
++  push(scratch);
++}
++void MacroAssembler::popReturnAddress() {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  pop(scratch);
++  xs_mtlr(scratch);
++}
++
++// ABI setup.
++void MacroAssembler::setupUnalignedABICall(Register scratch) {
++  MOZ_ASSERT(!IsCompilingWasm(), "wasm should only use aligned ABI calls");
++  setupNativeABICall();
++  dynamicAlignment_ = true;
++
++  movePtr(StackPointer, scratch);
++
++  // Force sp to be aligned.
++  subPtr(Imm32(sizeof(uintptr_t)), StackPointer);
++  andPtr(Imm32(~(ABIStackAlignment - 1)), StackPointer);
++  storePtr(scratch, Address(StackPointer, 0));
++}
++
++// ===============================================================
++// Arithmetic helpers.
++
++void MacroAssembler::flexibleDivMod32(Register lhs, Register rhs,
++                                      Register divOutput, Register remOutput,
++                                      bool isUnsigned, const LiveRegisterSet&) {
++  MOZ_ASSERT(lhs != divOutput && lhs != remOutput, "lhs is preserved");
++  MOZ_ASSERT(rhs != divOutput && rhs != remOutput, "rhs is preserved");
++
++  // PPC64 has no modulus instruction. Compute: rem = lhs - (lhs/rhs)*rhs
++  // PPC64 divw(INT32_MIN, -1) is undefined; quotient=INT32_MIN, remainder=0.
++  Label done;
++  if (!isUnsigned) {
++    Label notMinOverflow;
++    branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN), &notMinOverflow);
++    branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), &notMinOverflow);
++    move32(Imm32(INT32_MIN), divOutput);
++    move32(Imm32(0), remOutput);
++    jump(&done);
++    bind(&notMinOverflow);
++  }
++  if (isUnsigned) {
++    as_divwu(divOutput, lhs, rhs);
++  } else {
++    as_divw(divOutput, lhs, rhs);
++  }
++  as_extsw(divOutput, divOutput);
++  if (HasPOWER9()) {
++    if (isUnsigned) {
++      as_moduw(remOutput, lhs, rhs);
++    } else {
++      as_modsw(remOutput, lhs, rhs);
++    }
++  } else {
++    as_mullw(remOutput, divOutput, rhs);
++    as_subf(remOutput, remOutput, lhs);
++  }
++  as_extsw(remOutput, remOutput);
++  bind(&done);
++}
++
++void MacroAssembler::shiftIndex32AndAdd(Register indexTemp32, int shift,
++                                        Register pointer) {
++  if (IsShiftInScaleRange(shift)) {
++    computeEffectiveAddress(
++        BaseIndex(pointer, indexTemp32, ShiftToScale(shift)), pointer);
++    return;
++  }
++  lshift32(Imm32(shift), indexTemp32);
++  addPtr(indexTemp32, pointer);
++}
++
++void MacroAssembler::convertInt64ToDouble(Register64 src, FloatRegister dest) {
++  as_mtvsrd(dest, src.reg);
++  as_fcfid(dest, dest);
++}
++
++void MacroAssembler::nearbyIntDouble(RoundingMode mode, FloatRegister src,
++                                     FloatRegister dest) {
++  switch (mode) {
++    case RoundingMode::NearestTiesToEven: {
++      // PPC64's frin rounds ties away from zero, NOT to even (ISA v3.1).
++      // Use fctid+fcfid which uses FPSCR RN (default = round-to-nearest-even).
++      // Guard: if |src| >= 2^52, value is already integral (or NaN/Inf) —
++      // just copy src. This preserves NaN, Inf, and -0.
++      // Check via integer exponent extraction to avoid FP temp conflicts.
++      Label done;
++      UseScratchRegisterScope temps(*this);
++      Register scratch = temps.Acquire();
++      moveDouble(src, ScratchDoubleReg);
++      if (src != dest) {
++        moveDouble(src, dest);
++      }
++      if (HasPOWER9()) {
++        // xsxexpdp lays the 11-bit biased exponent in XT.dw0 with the
++        // rest zeroed, so mfvsrd reads it directly — drops the
++        // srdi+andi. masking pair.
++        ScratchSimd128Scope expScratch(*this);
++        as_xsxexpdp(expScratch, ScratchDoubleReg);
++        as_mfvsrd(scratch, expScratch);
++      } else {
++        as_mfvsrd(scratch, ScratchDoubleReg);
++        x_srdi(scratch, scratch, 52);
++        as_andi_rc(scratch, scratch, 0x7FF);
++      }
++      // Biased exponent >= 1075 (= 1023+52) means |val| >= 2^52.
++      // Also catches Inf (exp=2047) and NaN (exp=2047).
++      ma_cmp(scratch, Imm32(1075), Assembler::GreaterThanOrEqual);
++      ma_b(Assembler::GreaterThanOrEqual, &done);
++      as_fctid(dest, ScratchDoubleReg);
++      as_fcfid(dest, dest);
++      as_fcpsgn(dest, ScratchDoubleReg, dest);
++      bind(&done);
++      break;
++    }
++    case RoundingMode::TowardsZero:
++      as_friz(dest, src);
++      break;
++    case RoundingMode::Up:
++      as_frip(dest, src);
++      break;
++    case RoundingMode::Down:
++      as_frim(dest, src);
++      break;
++    default:
++      MOZ_CRASH("Unexpected rounding mode");
++  }
++}
++
++void MacroAssembler::nearbyIntFloat32(RoundingMode mode, FloatRegister src,
++                                      FloatRegister dest) {
++  // PPC FP rounding instructions operate on double-precision.
++  // For single-precision, we round as double then round back to single.
++  // The frsp instruction handles the double->single conversion.
++  nearbyIntDouble(mode, src, dest);
++  as_frsp(dest, dest);
++}
++
++// ===============================================================
++// Far jump support.
++
++CodeOffset MacroAssembler::farJumpWithPatch() {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++
++  // stanza(8) + mtctr + bctr = 10 instructions.
++  CodeOffset loadOffset(currentOffset());
++  m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++  emitLoad64Stanza(scratch, 0);
++  xs_mtctr(scratch);
++  as_bctr();
++  m_buffer.leaveNoPool();
++
++  return loadOffset;
++}
++
++// ===============================================================
++void MacroAssembler::flush() { Assembler::flush(); }
++
++// Wasm support.
++
++FaultingCodeOffset MacroAssembler::wasmTrapInstruction() {
++  m_buffer.flushPool();  // see comment in wasmLoadImpl
++  FaultingCodeOffset fco = FaultingCodeOffset(currentOffset());
++  xs_trap();
++  return fco;
++}
++
++// PPC64 SlowCallMarker: `ori r0, r0, 0` -- a NOP-like instruction
++// that won't appear in normal code generation.
++// ori r0, r0, 0 = 0x60000000 -- that's actually PPC_nop.
++// Use a distinguishable encoding: `ori r12, r12, 0` = 0x618C0000
++static const int32_t SlowCallMarker = 0x618C0000;
++
++void MacroAssembler::wasmMarkCallAsSlow() {
++  // Emit: ori r12, r12, 0
++  as_ori(CallReg, CallReg, 0);
++}
++
++void MacroAssembler::wasmCheckSlowCallsite(Register ra_, Label* notSlow,
++                                           Register temp1, Register temp2) {
++  MOZ_ASSERT(ra_ != temp2);
++  load32(Address(ra_, 0), temp2);
++  branch32(Assembler::NotEqual, temp2, Imm32(SlowCallMarker), notSlow);
++}
++
++CodeOffset MacroAssembler::wasmMarkedSlowCall(const wasm::CallSiteDesc& desc,
++                                              const Register reg) {
++  CodeOffset offset = call(desc, reg);
++  wasmMarkCallAsSlow();
++  return offset;
++}
++
++// ===============================================================
++// Additional stack operations.
++
++void MacroAssembler::Push(FloatRegister f) {
++  push(f);
++  adjustFrame(int32_t(sizeof(double)));
++}
++void MacroAssembler::Pop(FloatRegister f) {
++  pop(f);
++  adjustFrame(-int32_t(sizeof(double)));
++}
++void MacroAssembler::Pop(const ValueOperand& val) {
++  popValue(val);
++  adjustFrame(-int32_t(sizeof(Value)));
++}
++
++// static
++size_t MacroAssembler::PushRegsInMaskSizeInBytes(LiveRegisterSet set) {
++  return set.gprs().size() * sizeof(intptr_t) + set.fpus().getPushSizeInBytes();
++}
++
++void MacroAssembler::storeRegsInMask(LiveRegisterSet set, Address dest,
++                                     Register scratch) {
++  FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
++  mozilla::DebugOnly<unsigned> numFpu = fpuSet.size();
++  mozilla::DebugOnly<int32_t> diffF = fpuSet.getPushSizeInBytes();
++  mozilla::DebugOnly<int32_t> diffG = set.gprs().size() * sizeof(intptr_t);
++
++  MOZ_ASSERT(dest.offset >= diffG + diffF);
++
++  for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); ++iter) {
++    diffG -= sizeof(intptr_t);
++    dest.offset -= sizeof(intptr_t);
++    storePtr(*iter, dest);
++  }
++  MOZ_ASSERT(diffG == 0);
++
++  // Natural per-kind slot. See PushRegsInMask comment.
++  for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
++    FloatRegister reg = *iter;
++    diffF -= reg.size();
++    numFpu -= 1;
++    dest.offset -= reg.size();
++    if (reg.isSimd128()) {
++      storeUnalignedSimd128(reg, dest);
++    } else {
++      storeDouble(reg.asDouble(), dest);
++    }
++  }
++  MOZ_ASSERT(diffF == 0);
++}
++
++void MacroAssembler::freeStackTo(uint32_t framePushed) {
++  MOZ_ASSERT(framePushed <= framePushed_);
++  // SP = FP - framePushed
++  movePtr(FramePointer, StackPointer);
++  if (framePushed) {
++    subPtr(Imm32(framePushed), StackPointer);
++  }
++  framePushed_ = framePushed;
++}
++
++// ===============================================================
++// Additional call / patch operations.
++
++void MacroAssembler::call(JitCode* c) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++
++  uint64_t addr = uintptr_t(c->raw());
++  BufferOffset bo = emitLoad64Stanza(scratch, addr);
++  addPendingJump(bo, ImmPtr(c->raw()), RelocationKind::JITCODE);
++
++  callJitNoProfiler(scratch);
++}
++
++CodeOffset MacroAssembler::callWithPatch() {
++  // Emit a CallTag-sized stanza of nops. Will be patched by patchCall.
++  // Return offset AFTER the stanza (= the return address when bl executes).
++  m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  writeInst(NopInst);
++  m_buffer.leaveNoPool();
++  return CodeOffset(currentOffset());
++}
++
++void MacroAssembler::patchCall(uint32_t callerOffset, uint32_t calleeOffset) {
++  // callerOffset points AFTER the 10-instruction stanza (the return address).
++  // Subtract to find the stanza start. The `bl` goes at inst[9].
++  uint32_t stanzaStart = callerOffset - 10 * sizeof(uint32_t);
++  Instruction* i0 = (Instruction*)(m_buffer.getInst(BufferOffset(stanzaStart)));
++  // bl offset is relative to inst[9], which is at stanzaStart + 36.
++  intptr_t blAddr = (intptr_t)stanzaStart + 9 * (intptr_t)sizeof(uint32_t);
++  intptr_t callOffset = (intptr_t)calleeOffset - blAddr;
++  if (JOffImm26::IsInRange(callOffset)) {
++    i0[0].makeNop();
++    i0[1].makeNop();
++    i0[2].makeNop();
++    i0[3].makeNop();
++    i0[4].makeNop();
++    i0[5].makeNop();
++    i0[6].makeNop();
++    i0[7].makeNop();
++    i0[8].makeNop();
++    i0[9].setData(PPC_b | JOffImm26(callOffset).encode() | LinkB);
++  } else {
++    addLongJump(BufferOffset(stanzaStart), BufferOffset(calleeOffset));
++    WriteLoad64Instructions(i0, SecondScratchReg, LabelBase::INVALID_OFFSET);
++    i0[8].makeOp_mtctr(SecondScratchReg);
++    i0[9].makeOp_bctr(LinkB);
++  }
++}
++
++void MacroAssembler::patchFarJump(CodeOffset farJump, uint32_t targetOffset) {
++  Instruction* inst =
++      (Instruction*)m_buffer.getInst(BufferOffset(farJump.offset()));
++  // Extract the destination register from the existing stanza. Both shapes
++  // encode rD at LE bits [21..25] of their first "register-touching" slot:
++  // P8 = mflr rD at [2], P9+ = addpcis rD at [0]. Major opcode of slot [0]
++  // distinguishes (31 = mfspr, 19 = addpcis).
++  uint32_t i0 = inst[0].encode();
++  uint32_t regCode = (((i0 >> 26) & 0x3f) == 19)
++                         ? ((i0 >> 21) & 0x1f)
++                         : ((inst[2].encode() >> 21) & 0x1f);
++  Register reg = Register::FromCode(regCode);
++  WriteLoad64Instructions(inst, reg, LabelBase::INVALID_OFFSET);
++  addLongJump(BufferOffset(farJump.offset()), BufferOffset(targetOffset));
++}
++
++// static
++void MacroAssembler::patchFarJump(uint8_t* farJump, uint8_t* target) {
++  UpdateLoad64Value((Instruction*)farJump, (uint64_t)(uintptr_t)target);
++  FlushICache(farJump, 8 * sizeof(Instruction));
++}
++
++// static
++void MacroAssembler::patchNopToCall(uint8_t* callsite, uint8_t* target) {
++  // callsite points AFTER the 10-instruction stanza. Subtract to find start.
++  Instruction* inst = (Instruction*)callsite - 10;
++  WriteLoad64Instructions(inst, SecondScratchReg, (uint64_t)(uintptr_t)target);
++  inst[8].makeOp_mtctr(SecondScratchReg);
++  inst[9].makeOp_bctr(LinkB);
++  FlushICache(inst, 10 * sizeof(Instruction));
++}
++
++// static
++void MacroAssembler::patchCallToNop(uint8_t* callsite) {
++  // callsite points AFTER the 10-instruction stanza. Subtract to find start.
++  Instruction* inst = (Instruction*)callsite - 10;
++  for (int i = 0; i < 10; i++) {
++    inst[i].makeNop();
++  }
++  FlushICache(inst, 10 * sizeof(Instruction));
++}
++
++void MacroAssembler::patchMove32(CodeOffset offset, Imm32 n) {
++  // Patch an 8-instruction load64 sequence with a 32-bit value.
++  Instruction* inst =
++      (Instruction*)m_buffer.getInst(BufferOffset(offset.offset()));
++  UpdateLoad64Value(inst, uint64_t(int64_t(n.value)));
++}
++
++uint32_t MacroAssembler::pushFakeReturnAddress(Register scratch) {
++  CodeLabel cl;
++
++  // Use mov(CodeLabel*, Register) which always emits a full 8-instruction
++  // load64 sequence (via NOPs + WriteLoad64Instructions). This is critical
++  // because movePtr(ImmWord(0)) would optimize to a single li instruction,
++  // but processCodeLabels->Bind->UpdateLoad64Value expects the full
++  // 8-instruction literal pool sequence at the patchAt offset.
++  mov(&cl, scratch);
++
++  Push(scratch);
++
++  bind(&cl);
++  uint32_t retAddr = currentOffset();
++
++  addCodeLabel(cl);
++  return retAddr;
++}
++
++void MacroAssembler::callWithABINoProfiler(Register fun, ABIType result) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  // Save fun to scratch since fun might be clobbered by callWithABIPre.
++  movePtr(fun, scratch);
++
++  uint32_t stackAdjust;
++  callWithABIPre(&stackAdjust);
++  call(scratch);
++  callWithABIPost(stackAdjust, result);
++}
++
++// ===============================================================
++// Additional arithmetic helpers.
++
++void MacroAssembler::flexibleRemainder32(Register lhs, Register rhs,
++                                         Register dest, bool isUnsigned,
++                                         const LiveRegisterSet&) {
++  // rem = lhs - (lhs/rhs)*rhs
++  // PPC64 divw(INT32_MIN, -1) is undefined; result is 0.
++  Label done;
++  if (!isUnsigned) {
++    Label notMinOverflow;
++    branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN), &notMinOverflow);
++    branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), &notMinOverflow);
++    move32(Imm32(0), dest);
++    jump(&done);
++    bind(&notMinOverflow);
++  }
++  if (HasPOWER9()) {
++    if (isUnsigned) {
++      as_moduw(dest, lhs, rhs);
++    } else {
++      as_modsw(dest, lhs, rhs);
++    }
++  } else {
++    UseScratchRegisterScope temps(asMasm());
++    Register scratch = temps.Acquire();
++    if (isUnsigned) {
++      as_divwu(scratch, lhs, rhs);
++    } else {
++      as_divw(scratch, lhs, rhs);
++    }
++    as_mullw(scratch, scratch, rhs);
++    as_subf(dest, scratch, lhs);
++  }
++  as_extsw(dest, dest);
++  bind(&done);
++}
++
++void MacroAssembler::flexibleQuotientPtr(Register lhs, Register rhs,
++                                         Register dest, bool isUnsigned,
++                                         const LiveRegisterSet&) {
++  // PPC64 divd(INT64_MIN, -1) is undefined; return INT64_MIN to match
++  // ARM64/LoongArch64 hardware sdiv behavior.
++  Label done;
++  if (!isUnsigned) {
++    Label notMinOverflow;
++    branchPtr(Assembler::NotEqual, lhs, ImmWord(INT64_MIN), &notMinOverflow);
++    branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), &notMinOverflow);
++    movePtr(ImmWord(INT64_MIN), dest);
++    jump(&done);
++    bind(&notMinOverflow);
++  }
++  if (isUnsigned) {
++    as_divdu(dest, lhs, rhs);
++  } else {
++    as_divd(dest, lhs, rhs);
++  }
++  bind(&done);
++}
++
++void MacroAssembler::flexibleRemainderPtr(Register lhs, Register rhs,
++                                          Register dest, bool isUnsigned,
++                                          const LiveRegisterSet&) {
++  // rem = lhs - (lhs/rhs)*rhs
++  // PPC64 divd(INT64_MIN, -1) is undefined; result is 0.
++  Label done;
++  if (!isUnsigned) {
++    Label notMinOverflow;
++    branchPtr(Assembler::NotEqual, lhs, ImmWord(INT64_MIN), &notMinOverflow);
++    branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), &notMinOverflow);
++    movePtr(ImmWord(0), dest);
++    jump(&done);
++    bind(&notMinOverflow);
++  }
++  if (HasPOWER9()) {
++    if (isUnsigned) {
++      as_modud(dest, lhs, rhs);
++    } else {
++      as_modsd(dest, lhs, rhs);
++    }
++  } else {
++    UseScratchRegisterScope temps(asMasm());
++    Register scratch = temps.Acquire();
++    if (isUnsigned) {
++      as_divdu(scratch, lhs, rhs);
++    } else {
++      as_divd(scratch, lhs, rhs);
++    }
++    as_mulld(scratch, scratch, rhs);
++    as_subf(dest, scratch, lhs);
++  }
++  bind(&done);
++}
++
++// ===============================================================
++// Rounding helpers.
++
++void MacroAssembler::floorDoubleToInt32(FloatRegister src, Register dest,
++                                        Label* fail) {
++  ScratchDoubleScope fpscratch(asMasm());
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++
++  // Round toward negative infinity, then convert to int64.
++  as_frim(fpscratch, src);
++  as_fctidz(fpscratch, fpscratch);
++  as_mfvsrd(dest, fpscratch);
++
++  // Check if result fits in int32.
++  as_extsw(scratch, dest);
++  as_cmpd(dest, scratch);
++  ma_b(NotEqual, fail);
++
++  // Check for -0 and NaN when result is zero.
++  Label notZero;
++  as_cmpdi(dest, 0);
++  ma_b(NotEqual, &notZero);
++  {
++    // If top 2 bits of src are set, it's negative or NaN.
++    as_mfvsrd(dest, src);
++    // rldicl. = x_srdi + record form: dest = top 2 bits, CR0[eq]=(dest==0).
++    // Folds the explicit cmpdi src,0 that would otherwise drive the branch.
++    as_rldicl_rc(dest, dest, 2, 62);
++    ma_b(NotEqual, fail);
++  }
++  bind(&notZero);
++}
++
++void MacroAssembler::floorFloat32ToInt32(FloatRegister src, Register dest,
++                                         Label* fail) {
++  ScratchDoubleScope fpscratch(asMasm());
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++
++  // PPC FP rounding works on doubles. Single-precision FPRs are
++  // already in double-width registers, so frim works fine.
++  as_frim(fpscratch, src);
++  as_fctidz(fpscratch, fpscratch);
++  as_mfvsrd(dest, fpscratch);
++
++  // Check if result fits in int32.
++  as_extsw(scratch, dest);
++  as_cmpd(dest, scratch);
++  ma_b(NotEqual, fail);
++
++  // Check for -0 and NaN when result is zero.
++  Label notZero;
++  as_cmpdi(dest, 0);
++  ma_b(NotEqual, &notZero);
++  {
++    // src is held in the FPR as a 64-bit double (lfs widens float32 to
++    // double on load), so the same top-2-bits check used for doubles
++    // applies: bit 63 = sign, bit 62 = exponent MSB. Nonzero means -0,
++    // ±Inf, NaN, or a large magnitude — none of which is +0.
++    as_mfvsrd(dest, src);
++    // rldicl. = x_srdi + record form: dest = top 2 bits, CR0[eq]=(dest==0).
++    // Folds the explicit cmpdi src,0 that would otherwise drive the branch.
++    as_rldicl_rc(dest, dest, 2, 62);
++    ma_b(NotEqual, fail);
++  }
++  bind(&notZero);
++}
++
++void MacroAssembler::ceilDoubleToInt32(FloatRegister src, Register dest,
++                                       Label* fail) {
++  ScratchDoubleScope fpscratch(asMasm());
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++
++  as_frip(fpscratch, src);
++  as_fctidz(fpscratch, fpscratch);
++  as_mfvsrd(dest, fpscratch);
++
++  // Check if result fits in int32.
++  as_extsw(scratch, dest);
++  as_cmpd(dest, scratch);
++  ma_b(NotEqual, fail);
++
++  // Check for (-1, -0] and NaN when result is zero.
++  Label notZero;
++  as_cmpdi(dest, 0);
++  ma_b(NotEqual, &notZero);
++  {
++    // If binary value is not zero, input was not 0 (could be -0 or NaN).
++    as_mfvsrd(dest, src);
++    as_cmpdi(dest, 0);
++    ma_b(NotEqual, fail);
++  }
++  bind(&notZero);
++}
++
++void MacroAssembler::ceilFloat32ToInt32(FloatRegister src, Register dest,
++                                        Label* fail) {
++  ScratchDoubleScope fpscratch(asMasm());
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++
++  as_frip(fpscratch, src);
++  as_fctidz(fpscratch, fpscratch);
++  as_mfvsrd(dest, fpscratch);
++
++  // Check if result fits in int32.
++  as_extsw(scratch, dest);
++  as_cmpd(dest, scratch);
++  ma_b(NotEqual, fail);
++
++  // Check for (-1, -0] and NaN when result is zero.
++  Label notZero;
++  as_cmpdi(dest, 0);
++  ma_b(NotEqual, &notZero);
++  {
++    as_mfvsrd(dest, src);
++    as_cmpdi(dest, 0);
++    ma_b(NotEqual, fail);
++  }
++  bind(&notZero);
++}
++
++void MacroAssembler::truncDoubleToInt32(FloatRegister src, Register dest,
++                                        Label* fail) {
++  ScratchDoubleScope fpscratch(asMasm());
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++
++  as_fctidz(fpscratch, src);
++  as_mfvsrd(dest, fpscratch);
++
++  // Check if result fits in int32.
++  as_extsw(scratch, dest);
++  as_cmpd(dest, scratch);
++  ma_b(NotEqual, fail);
++
++  // Check for -0 and NaN when result is zero.
++  Label notZero;
++  as_cmpdi(dest, 0);
++  ma_b(NotEqual, &notZero);
++  {
++    as_mfvsrd(dest, src);
++    // rldicl. = x_srdi + record form: dest = top 2 bits, CR0[eq]=(dest==0).
++    // Folds the explicit cmpdi src,0 that would otherwise drive the branch.
++    as_rldicl_rc(dest, dest, 2, 62);
++    ma_b(NotEqual, fail);
++  }
++  bind(&notZero);
++}
++
++void MacroAssembler::truncFloat32ToInt32(FloatRegister src, Register dest,
++                                         Label* fail) {
++  ScratchDoubleScope fpscratch(asMasm());
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++
++  as_fctidz(fpscratch, src);
++  as_mfvsrd(dest, fpscratch);
++
++  // Check if result fits in int32.
++  as_extsw(scratch, dest);
++  as_cmpd(dest, scratch);
++  ma_b(NotEqual, fail);
++
++  // Check for -0 and NaN when result is zero.
++  Label notZero;
++  as_cmpdi(dest, 0);
++  ma_b(NotEqual, &notZero);
++  {
++    as_mfvsrd(dest, src);
++    // rldicl. = x_srdi + record form: dest = top 2 bits, CR0[eq]=(dest==0).
++    // Folds the explicit cmpdi src,0 that would otherwise drive the branch.
++    as_rldicl_rc(dest, dest, 2, 62);
++    ma_b(NotEqual, fail);
++  }
++  bind(&notZero);
++}
++
++void MacroAssembler::roundDoubleToInt32(FloatRegister src, Register dest,
++                                        FloatRegister temp, Label* fail) {
++  ScratchDoubleScope fpscratch(asMasm());
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++
++  Label negative, end, performRound;
++
++  // Branch for negative inputs.
++  zeroDouble(fpscratch);
++  branchDouble(DoubleGreaterThanOrEqual, src, fpscratch, &performRound);
++
++  // Input is negative.
++  loadConstantDouble(-0.5, fpscratch);
++  branchDouble(DoubleGreaterThanOrEqual, src, fpscratch, fail);
++  jump(&performRound);
++
++  bind(&performRound);
++  {
++    loadConstantDouble(GetBiggestNumberLessThan(0.5), temp);
++    as_fadd(fpscratch, src, temp);
++    as_frim(fpscratch, fpscratch);
++    as_fctidz(fpscratch, fpscratch);
++    as_mfvsrd(dest, fpscratch);
++
++    // Check if result fits in int32.
++    as_extsw(scratch, dest);
++    as_cmpd(dest, scratch);
++    ma_b(NotEqual, fail);
++  }
++  bind(&end);
++
++  // Check for -0 and NaN when result is zero.
++  Label notZero;
++  as_cmpdi(dest, 0);
++  ma_b(NotEqual, &notZero);
++  {
++    as_mfvsrd(dest, src);
++    as_cmpdi(dest, 0);
++    ma_b(NotEqual, fail);
++  }
++  bind(&notZero);
++}
++
++void MacroAssembler::roundFloat32ToInt32(FloatRegister src, Register dest,
++                                         FloatRegister temp, Label* fail) {
++  ScratchDoubleScope fpscratch(asMasm());
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++
++  Label negative, end, performRound;
++
++  // Branch for non-negative inputs.
++  loadConstantFloat32(0.0f, fpscratch);
++  branchFloat(DoubleGreaterThanOrEqual, src, fpscratch, &performRound);
++
++  // Input is negative.
++  loadConstantFloat32(-0.5f, fpscratch);
++  branchFloat(DoubleGreaterThanOrEqual, src, fpscratch, fail);
++  jump(&performRound);
++
++  bind(&performRound);
++  {
++    loadConstantFloat32(float(GetBiggestNumberLessThan(0.5)), temp);
++    as_fadds(fpscratch, src, temp);
++    as_frim(fpscratch, fpscratch);
++    as_fctidz(fpscratch, fpscratch);
++    as_mfvsrd(dest, fpscratch);
++
++    // Check if result fits in int32.
++    as_extsw(scratch, dest);
++    as_cmpd(dest, scratch);
++    ma_b(NotEqual, fail);
++  }
++  bind(&end);
++
++  // Check for -0 and NaN when result is zero.
++  Label notZero;
++  as_cmpdi(dest, 0);
++  ma_b(NotEqual, &notZero);
++  {
++    as_mfvsrd(dest, src);
++    as_cmpdi(dest, 0);
++    ma_b(NotEqual, fail);
++  }
++  bind(&notZero);
++}
++
++// ===============================================================
++// FP conversion / copy-sign.
++
++void MacroAssembler::convertIntPtrToDouble(Register src, FloatRegister dest) {
++  convertInt64ToDouble(Register64(src), dest);
++}
++
++void MacroAssembler::copySignDouble(FloatRegister lhs, FloatRegister rhs,
++                                    FloatRegister output) {
++  // fcpsgn frt, fra, frb: copies sign of fra to magnitude of frb.
++  // lhs = magnitude source, rhs = sign source.
++  as_fcpsgn(output, rhs, lhs);
++}
++
++void MacroAssembler::copySignFloat32(FloatRegister lhs, FloatRegister rhs,
++                                     FloatRegister output) {
++  as_fcpsgn(output, rhs, lhs);
++}
++
++// ===============================================================
++// GC / nursery helpers.
++
++void MacroAssembler::loadStoreBuffer(Register ptr, Register buffer) {
++  andPtr(Imm32(int32_t(~gc::ChunkMask)), ptr, buffer);
++  loadPtr(Address(buffer, gc::ChunkStoreBufferOffset), buffer);
++}
++
++void MacroAssembler::branchValueIsNurseryCell(Condition cond,
++                                              const Address& address,
++                                              Register temp, Label* label) {
++  branchValueIsNurseryCellImpl(cond, address, temp, label);
++}
++
++template <typename T>
++void MacroAssembler::branchValueIsNurseryCellImpl(Condition cond,
++                                                  const T& value, Register temp,
++                                                  Label* label) {
++  MOZ_ASSERT(cond == Assembler::Equal || cond == Assembler::NotEqual);
++  MOZ_ASSERT(temp != InvalidReg);
++  Label done;
++  branchTestGCThing(Assembler::NotEqual, value,
++                    cond == Assembler::Equal ? &done : label);
++
++  getGCThingValueChunk(value, temp);
++  loadPtr(Address(temp, gc::ChunkStoreBufferOffset), temp);
++  branchPtr(InvertCondition(cond), temp, ImmWord(0), label);
++
++  bind(&done);
++}
++
++// ===============================================================
++// Template instantiations.
++
++template <typename T>
++void MacroAssembler::storeUnboxedValue(const ConstantOrRegister& value,
++                                       MIRType valueType, const T& dest) {
++  MOZ_ASSERT(valueType < MIRType::Value);
++
++  if (valueType == MIRType::Double) {
++    boxDouble(value.reg().typedReg().fpu(), dest);
++    return;
++  }
++
++  if (value.constant()) {
++    storeValue(value.value(), dest);
++  } else {
++    storeValue(ValueTypeFromMIRType(valueType), value.reg().typedReg().gpr(),
++               dest);
++  }
++}
++
++template void MacroAssembler::storeUnboxedValue(const ConstantOrRegister& value,
++                                                MIRType valueType,
++                                                const Address& dest);
++template void MacroAssembler::storeUnboxedValue(
++    const ConstantOrRegister& value, MIRType valueType,
++    const BaseObjectElementIndex& dest);
++
++// ===============================================================
++// Misc stubs.
++
++void MacroAssembler::comment(const char* msg) {}
++
++void MacroAssembler::speculationBarrier() {
++  // isync provides execution synchronization: discards prefetched
++  // instructions and forces a refetch+reexecute past the barrier.
++  // No instruction following isync may begin (architecturally) until
++  // isync completes, blocking speculative bypass — exactly the
++  // Spectre v1 guarantee needed after a C call returns a value that
++  // may influence subsequent loads. Reachable from shared
++  // CodeGenerator under JitOptions.spectreJitToCxxCalls.
++  as_isync();
++}
++
++void MacroAssembler::atomicPause() { nop(); }
++
++void MacroAssembler::enterFakeExitFrameForWasm(Register cxreg, Register scratch,
++                                               ExitFrameType type) {
++  enterFakeExitFrame(cxreg, scratch, type);
++}
++
++void MacroAssembler::wasmBoundsCheck32(Condition cond, Register index,
++                                       Register boundsCheckLimit,
++                                       Label* label) {
++  ma_cmp(index, boundsCheckLimit, cond);
++  ma_b(cond, label);
++}
++
++void MacroAssembler::wasmBoundsCheck32(Condition cond, Register index,
++                                       Address boundsCheckLimit, Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  load32(boundsCheckLimit, scratch);
++  ma_cmp(index, scratch, cond);
++  ma_b(cond, label);
++}
++
++void MacroAssembler::wasmBoundsCheck64(Condition cond, Register64 index,
++                                       Register64 boundsCheckLimit,
++                                       Label* label) {
++  ma_cmp(index.reg, boundsCheckLimit.reg, cond);
++  ma_b(cond, label);
++}
++
++void MacroAssembler::wasmBoundsCheck64(Condition cond, Register64 index,
++                                       Address boundsCheckLimit, Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  loadPtr(boundsCheckLimit, scratch);
++  ma_cmp(index.reg, scratch, cond);
++  ma_b(cond, label);
++}
++
++CodeOffset MacroAssembler::move32WithPatch(Register dest) {
++  CodeOffset offset(currentOffset());
++  emitLoad64Stanza(dest, 0);
++  return offset;
++}
++
++CodeOffset MacroAssembler::sub32FromMemAndBranchIfNegativeWithPatch(
++    Address address, Label* label) {
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  MOZ_ASSERT(scratch != address.base);
++  load32(address, scratch);
++  // Subtract a placeholder value (will be patched).
++  // Use addi with positive placeholder (128), which will be patched to
++  // addi with negative value. The immediate is in the addi instruction.
++  as_addi(scratch, scratch, 128);
++  CodeOffset patchPoint = CodeOffset(currentOffset());
++  store32(scratch, address);
++  // Branch if result is negative (signed).
++  as_cmpwi(scratch, 0);
++  ma_b(LessThan, label);
++  return patchPoint;
++}
++
++bool MacroAssembler::convertUInt64ToDoubleNeedsTemp() { return false; }
++
++void MacroAssembler::call(ImmWord imm) { call(ImmPtr((void*)imm.value)); }
++
++void MacroAssembler::convertUInt64ToDouble(Register64 src, FloatRegister dest,
++                                           Register temp) {
++  MOZ_ASSERT(temp == Register::Invalid());
++  // POWER7+ has fcfidu (unsigned i64 → f64) as a single instruction; no
++  // sign-split / branch / GPR scratch needed.
++  as_mtvsrd(dest, src.reg);
++  as_fcfidu(dest, dest);
++}
++
++void MacroAssembler::convertInt64ToFloat32(Register64 src, FloatRegister dest) {
++  as_mtvsrd(dest, src.reg);
++  as_fcfids(dest, dest);
++}
++
++void MacroAssembler::convertUInt64ToFloat32(Register64 src, FloatRegister dest,
++                                            Register temp) {
++  MOZ_ASSERT(temp == Register::Invalid());
++  // POWER7+ has fcfidus (unsigned i64 → f32) as a single instruction.
++  as_mtvsrd(dest, src.reg);
++  as_fcfidus(dest, dest);
++}
++
++void MacroAssembler::flexibleQuotient32(
++    Register lhs, Register rhs, Register dest, bool isUnsigned,
++    const LiveRegisterSet& volatileLiveRegs) {
++  // PPC64 divw(INT32_MIN, -1) is undefined; return INT32_MIN to match
++  // ARM64/LoongArch64 hardware sdiv behavior.
++  Label done;
++  if (!isUnsigned) {
++    Label notMinOverflow;
++    branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN), &notMinOverflow);
++    branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), &notMinOverflow);
++    move32(Imm32(INT32_MIN), dest);
++    jump(&done);
++    bind(&notMinOverflow);
++  }
++  if (isUnsigned) {
++    as_divwu(dest, lhs, rhs);
++  } else {
++    as_divw(dest, lhs, rhs);
++  }
++  as_extsw(dest, dest);
++  bind(&done);
++}
++
++void MacroAssembler::oolWasmTruncateCheckF32ToI32(
++    FloatRegister input, Register output, TruncFlags flags,
++    const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
++  outOfLineWasmTruncateToInt32Check(input, output, MIRType::Float32, flags,
++                                    rejoin, trapSiteDesc);
++}
++
++void MacroAssembler::oolWasmTruncateCheckF32ToI64(
++    FloatRegister input, Register64 output, TruncFlags flags,
++    const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
++  outOfLineWasmTruncateToInt64Check(input, output, MIRType::Float32, flags,
++                                    rejoin, trapSiteDesc);
++}
++
++void MacroAssembler::oolWasmTruncateCheckF64ToI32(
++    FloatRegister input, Register output, TruncFlags flags,
++    const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
++  outOfLineWasmTruncateToInt32Check(input, output, MIRType::Double, flags,
++                                    rejoin, trapSiteDesc);
++}
++
++void MacroAssembler::oolWasmTruncateCheckF64ToI64(
++    FloatRegister input, Register64 output, TruncFlags flags,
++    const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
++  outOfLineWasmTruncateToInt64Check(input, output, MIRType::Double, flags,
++                                    rejoin, trapSiteDesc);
++}
++
++void MacroAssemblerPPC64Compat::outOfLineWasmTruncateToInt32Check(
++    FloatRegister input, Register output, MIRType fromType, TruncFlags flags,
++    Label* rejoin, const wasm::TrapSiteDesc& trapSiteDesc) {
++  bool isUnsigned = flags & TRUNC_UNSIGNED;
++  bool isSaturating = flags & TRUNC_SATURATING;
++
++  if (isSaturating) {
++    ScratchDoubleScope fpscratch(asMasm());
++    if (fromType == MIRType::Double) {
++      asMasm().loadConstantDouble(0.0, fpscratch);
++    } else {
++      asMasm().loadConstantFloat32(0.0f, fpscratch);
++    }
++
++    if (isUnsigned) {
++      // If input < 0 or NaN, output = 0; else output = UINT32_MAX.
++      Label notNegOrNaN;
++      if (fromType == MIRType::Double) {
++        asMasm().branchDouble(Assembler::DoubleGreaterThanOrEqual, input,
++                              fpscratch, &notNegOrNaN);
++      } else {
++        asMasm().branchFloat(Assembler::DoubleGreaterThanOrEqual, input,
++                             fpscratch, &notNegOrNaN);
++      }
++      asMasm().move32(Imm32(0), output);
++      asMasm().jump(rejoin);
++      asMasm().bind(&notNegOrNaN);
++      asMasm().move32(Imm32(UINT32_MAX), output);
++    } else {
++      // Signed: NaN -> 0, negative overflow -> INT32_MIN,
++      // positive overflow already saturated to INT32_MAX.
++      Label notNaN, done;
++      if (fromType == MIRType::Double) {
++        asMasm().branchDouble(Assembler::DoubleOrdered, input, input, &notNaN);
++      } else {
++        asMasm().branchFloat(Assembler::DoubleOrdered, input, input, &notNaN);
++      }
++      asMasm().move32(Imm32(0), output);
++      asMasm().jump(rejoin);
++
++      asMasm().bind(&notNaN);
++      if (fromType == MIRType::Double) {
++        asMasm().branchDouble(Assembler::DoubleGreaterThanOrEqual, input,
++                              fpscratch, rejoin);
++      } else {
++        asMasm().branchFloat(Assembler::DoubleGreaterThanOrEqual, input,
++                             fpscratch, rejoin);
++      }
++      asMasm().move32(Imm32(INT32_MIN), output);
++    }
++
++    MOZ_ASSERT(rejoin->bound());
++    asMasm().jump(rejoin);
++    return;
++  }
++
++  Label inputIsNaN;
++  if (fromType == MIRType::Double) {
++    asMasm().branchDouble(Assembler::DoubleUnordered, input, input,
++                          &inputIsNaN);
++  } else {
++    asMasm().branchFloat(Assembler::DoubleUnordered, input, input, &inputIsNaN);
++  }
++
++  asMasm().wasmTrap(wasm::Trap::IntegerOverflow, trapSiteDesc);
++  asMasm().bind(&inputIsNaN);
++  asMasm().wasmTrap(wasm::Trap::InvalidConversionToInteger, trapSiteDesc);
++}
++
++void MacroAssemblerPPC64Compat::outOfLineWasmTruncateToInt64Check(
++    FloatRegister input, Register64 output_, MIRType fromType, TruncFlags flags,
++    Label* rejoin, const wasm::TrapSiteDesc& trapSiteDesc) {
++  bool isUnsigned = flags & TRUNC_UNSIGNED;
++  bool isSaturating = flags & TRUNC_SATURATING;
++
++  if (isSaturating) {
++    ScratchDoubleScope fpscratch(asMasm());
++    Register output = output_.reg;
++
++    if (fromType == MIRType::Double) {
++      asMasm().loadConstantDouble(0.0, fpscratch);
++    } else {
++      asMasm().loadConstantFloat32(0.0f, fpscratch);
++    }
++
++    if (isUnsigned) {
++      Label notNegOrNaN;
++      if (fromType == MIRType::Double) {
++        asMasm().branchDouble(Assembler::DoubleGreaterThanOrEqual, input,
++                              fpscratch, &notNegOrNaN);
++      } else {
++        asMasm().branchFloat(Assembler::DoubleGreaterThanOrEqual, input,
++                             fpscratch, &notNegOrNaN);
++      }
++      asMasm().movePtr(ImmWord(0), output);
++      asMasm().jump(rejoin);
++      asMasm().bind(&notNegOrNaN);
++      asMasm().movePtr(ImmWord(UINT64_MAX), output);
++    } else {
++      Label notNaN;
++      if (fromType == MIRType::Double) {
++        asMasm().branchDouble(Assembler::DoubleOrdered, input, input, &notNaN);
++      } else {
++        asMasm().branchFloat(Assembler::DoubleOrdered, input, input, &notNaN);
++      }
++      asMasm().movePtr(ImmWord(0), output);
++      asMasm().jump(rejoin);
++
++      asMasm().bind(&notNaN);
++      if (fromType == MIRType::Double) {
++        asMasm().branchDouble(Assembler::DoubleGreaterThanOrEqual, input,
++                              fpscratch, rejoin);
++      } else {
++        asMasm().branchFloat(Assembler::DoubleGreaterThanOrEqual, input,
++                             fpscratch, rejoin);
++      }
++      asMasm().movePtr(ImmWord(INT64_MIN), output);
++    }
++
++    MOZ_ASSERT(rejoin->bound());
++    asMasm().jump(rejoin);
++    return;
++  }
++
++  Label inputIsNaN;
++  if (fromType == MIRType::Double) {
++    asMasm().branchDouble(Assembler::DoubleUnordered, input, input,
++                          &inputIsNaN);
++  } else {
++    asMasm().branchFloat(Assembler::DoubleUnordered, input, input, &inputIsNaN);
++  }
++
++  asMasm().wasmTrap(wasm::Trap::IntegerOverflow, trapSiteDesc);
++  asMasm().bind(&inputIsNaN);
++  asMasm().wasmTrap(wasm::Trap::InvalidConversionToInteger, trapSiteDesc);
++}
++
++void MacroAssembler::PopStackPtr() {
++  loadPtr(Address(StackPointer, 0), StackPointer);
++  adjustFrame(-int32_t(sizeof(intptr_t)));
++}
++
++void MacroAssembler::patchSub32FromMemAndBranchIfNegative(CodeOffset offset,
++                                                          Imm32 imm) {
++  int32_t val = imm.value;
++  MOZ_RELEASE_ASSERT(val >= 1 && val <= 127);
++  // Patch the addi instruction that's right before patchPoint.
++  // addi is 1 instruction before the CodeOffset (which is after the addi).
++  Instruction* inst =
++      (Instruction*)m_buffer.getInst(BufferOffset(offset.offset() - 4));
++  // Rewrite the immediate field to -val.
++  // PPC addi: opcode(6) | RT(5) | RA(5) | SI(16)
++  uint32_t instWord = inst->encode();
++  uint32_t base = instWord & 0xffff0000;
++  inst->setData(base | (uint16_t)(-val & 0xffff));
++}
++
++void MacroAssembler::wasmTruncateDoubleToInt32(FloatRegister input,
++                                               Register output,
++                                               bool isSaturating,
++                                               Label* oolEntry) {
++  ScratchDoubleScope fpscratch(asMasm());
++  // Clear VXCVI (bit 23) before the conversion so we can detect overflow.
++  as_mtfsb0(23);
++  as_fctiwz(fpscratch, input);
++  as_mfvsrd(output, fpscratch);
++  as_extsw(output, output);
++  // Move FPSCR field 5 (which contains VXCVI) to CR0.
++  // If the conversion was invalid (NaN or out-of-range), VXCVI=1 → SO set.
++  as_mcrfs(cr0, 5);
++  ma_b(SOBit, oolEntry);
++}
++
++void MacroAssembler::wasmTruncateDoubleToUInt32(FloatRegister input,
++                                                Register output,
++                                                bool isSaturating,
++                                                Label* oolEntry) {
++  ScratchDoubleScope fpscratch(asMasm());
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  // Always check for NaN — the ool handler clamps for saturating mode.
++  as_fcmpu(input, input);
++  ma_b(DoubleUnordered, oolEntry);
++  as_fctidz(fpscratch, input);
++  as_mfvsrd(output, fpscratch);
++  x_srdi(scratch, output, 32);
++  as_extsw(output, output);
++  as_cmpdi(scratch, 0);
++  ma_b(NotEqual, oolEntry);
++}
++
++void MacroAssembler::wasmTruncateFloat32ToInt32(FloatRegister input,
++                                                Register output,
++                                                bool isSaturating,
++                                                Label* oolEntry) {
++  ScratchDoubleScope fpscratch(asMasm());
++  as_mtfsb0(23);
++  as_fctiwz(fpscratch, input);
++  as_mfvsrd(output, fpscratch);
++  as_extsw(output, output);
++  as_mcrfs(cr0, 5);
++  ma_b(SOBit, oolEntry);
++}
++
++void MacroAssembler::wasmTruncateFloat32ToUInt32(FloatRegister input,
++                                                 Register output,
++                                                 bool isSaturating,
++                                                 Label* oolEntry) {
++  ScratchDoubleScope fpscratch(asMasm());
++  UseScratchRegisterScope temps(asMasm());
++  Register scratch = temps.Acquire();
++  as_fcmpu(input, input);
++  ma_b(DoubleUnordered, oolEntry);
++  as_fctidz(fpscratch, input);
++  as_mfvsrd(output, fpscratch);
++  x_srdi(scratch, output, 32);
++  as_extsw(output, output);
++  as_cmpdi(scratch, 0);
++  ma_b(NotEqual, oolEntry);
++}
++
++void MacroAssembler::wasmTruncateDoubleToInt64(
++    FloatRegister input, Register64 output, bool isSaturating, Label* oolEntry,
++    Label* oolRejoin, FloatRegister tempDouble) {
++  MOZ_ASSERT(tempDouble.isInvalid());
++  ScratchDoubleScope fpscratch(asMasm());
++  as_mtfsb0(23);
++  as_fctidz(fpscratch, input);
++  as_mfvsrd(output.reg, fpscratch);
++  as_mcrfs(cr0, 5);
++  ma_b(SOBit, oolEntry);
++  if (isSaturating) {
++    bind(oolRejoin);
++  }
++}
++
++void MacroAssembler::wasmTruncateFloat32ToInt64(
++    FloatRegister input, Register64 output, bool isSaturating, Label* oolEntry,
++    Label* oolRejoin, FloatRegister tempFloat) {
++  MOZ_ASSERT(tempFloat.isInvalid());
++  ScratchDoubleScope fpscratch(asMasm());
++  as_mtfsb0(23);
++  as_fctidz(fpscratch, input);
++  as_mfvsrd(output.reg, fpscratch);
++  as_mcrfs(cr0, 5);
++  ma_b(SOBit, oolEntry);
++  if (isSaturating) {
++    bind(oolRejoin);
++  }
++}
++
++void MacroAssembler::wasmTruncateDoubleToUInt64(
++    FloatRegister input, Register64 output, bool isSaturating, Label* oolEntry,
++    Label* oolRejoin, FloatRegister tempDouble) {
++  MOZ_ASSERT(tempDouble.isInvalid());
++  ScratchDoubleScope fpscratch(asMasm());
++  as_mtfsb0(23);
++  as_fctiduz(fpscratch, input);
++  as_mfvsrd(output.reg, fpscratch);
++  as_mcrfs(cr0, 5);
++  ma_b(SOBit, oolEntry);
++  if (isSaturating) {
++    bind(oolRejoin);
++  }
++}
++
++void MacroAssembler::wasmTruncateFloat32ToUInt64(
++    FloatRegister input, Register64 output, bool isSaturating, Label* oolEntry,
++    Label* oolRejoin, FloatRegister tempFloat) {
++  MOZ_ASSERT(tempFloat.isInvalid());
++  ScratchDoubleScope fpscratch(asMasm());
++  as_mtfsb0(23);
++  as_fctiduz(fpscratch, input);
++  as_mfvsrd(output.reg, fpscratch);
++  as_mcrfs(cr0, 5);
++  ma_b(SOBit, oolEntry);
++  if (isSaturating) {
++    bind(oolRejoin);
++  }
++}
++
++void MacroAssemblerPPC64Compat::profilerEnterFrame(Register framePtr,
++                                                   Register scratch) {
++  asMasm().loadJSContext(scratch);
++  loadPtr(Address(scratch, offsetof(JSContext, profilingActivation_)), scratch);
++  storePtr(framePtr,
++           Address(scratch, JitActivation::offsetOfLastProfilingFrame()));
++  storePtr(ImmPtr(nullptr),
++           Address(scratch, JitActivation::offsetOfLastProfilingCallSite()));
++}
++
++void MacroAssemblerPPC64Compat::profilerExitFrame() {
++  jump(asMasm().runtime()->jitRuntime()->getProfilerExitFrameTail());
++}
++
++void MacroAssemblerPPC64Compat::ma_mod_mask(Register src, Register dest,
++                                            Register hold, Register remain,
++                                            int32_t shift, Label* negZero) {
++  // Compute x % ((1<<shift) - 1) by digit-summing in base b = 1<<shift.
++  // Since b % (b-1) == 1, x % (b-1) == sum of base-b digits of x, mod (b-1).
++  int32_t mask = (1 << shift) - 1;
++  Label head, negative, sumSigned, done;
++
++  as_or_(remain, src, src);  // move src -> remain
++  xs_li(dest, 0);
++
++  // Check sign (32-bit signed comparison)
++  as_cmpwi(remain, 0);
++  ma_b(Assembler::LessThan, &negative);
++  xs_li(hold, 1);
++  jump(&head);
++
++  bind(&negative);
++  xs_li(hold, -1);
++  as_neg(remain, remain);
++  as_rldicl(remain, remain, 0, 32);
++
++  bind(&head);
++  {
++    UseScratchRegisterScope temps(asMasm());
++    Register scratch = temps.Acquire();
++
++    // Extract bottom 'shift' bits: scratch = remain & mask
++    move32(Imm32(mask), scratch);
++    as_and_(scratch, remain, scratch);
++
++    // Add to accumulator
++    as_add(dest, dest, scratch);
++
++    // Trial subtraction: scratch = dest - mask
++    move32(Imm32(mask), scratch);
++    as_subf(scratch, scratch, dest);  // scratch = dest - scratch
++
++    // If (dest - mask) > 0, keep the subtracted value
++    as_cmpwi(scratch, 0);
++    ma_b(Assembler::LessThan, &sumSigned);
++    as_or_(dest, scratch, scratch);  // dest = scratch
++    bind(&sumSigned);
++
++    // Shift out the bits we just processed
++    x_srwi(remain, remain, shift);
++
++    // Continue if remain != 0
++    as_cmpwi(remain, 0);
++    ma_b(Assembler::NotEqual, &head);
++  }
++
++  // If input was negative, negate result
++  as_cmpwi(hold, 0);
++  ma_b(Assembler::GreaterThanOrEqual, &done);
++
++  if (negZero != nullptr) {
++    as_cmpwi(dest, 0);
++    ma_b(Assembler::Equal, negZero);
++  }
++
++  as_neg(dest, dest);
++  as_extsw(dest, dest);
++
++  bind(&done);
++}
++
++// ========================================================================
++// Atomic operations.
++
++template <typename T>
++static void CompareExchange(MacroAssembler& masm,
++                            const wasm::MemoryAccessDesc* access,
++                            Scalar::Type type, Synchronization sync,
++                            const T& mem, Register oldval, Register newval,
++                            Register valueTemp, Register offsetTemp,
++                            Register maskTemp, Register output) {
++  UseScratchRegisterScope temps(masm);
++  bool signExtend = Scalar::isSignedIntType(type);
++  unsigned nbytes = Scalar::byteSize(type);
++
++  switch (nbytes) {
++    case 1:
++    case 2:
++      break;
++    case 4:
++      MOZ_ASSERT(valueTemp == InvalidReg);
++      MOZ_ASSERT(offsetTemp == InvalidReg);
++      MOZ_ASSERT(maskTemp == InvalidReg);
++      break;
++    default:
++      MOZ_CRASH();
++  }
++
++  Label again, end;
++
++  Register scratch = temps.Acquire();
++  masm.computeEffectiveAddress(mem, scratch);
++
++  if (nbytes == 4) {
++    masm.memoryBarrierBefore(sync);
++    masm.bind(&again);
++
++    if (access) {
++      masm.flushBuffer();  // see comment in wasmLoadImpl
++      masm.append(*access, wasm::TrapMachineInsn::Atomic,
++                  FaultingCodeOffset(masm.currentOffset()));
++    }
++
++    masm.as_lwarx(output, r0, scratch);
++    // ma_cmp(..., is32bit=true) emits cmpw, which compares only bits
++    // 32:63 (low 32) of both operands per ISA v3.0B. The upper
++    // 32 bits of oldval are ignored, so no canonicalising extsw needed.
++    masm.ma_cmp(output, oldval, Assembler::NotEqual, /* is32bit */ true);
++    masm.ma_b(Assembler::NotEqual, &end);
++    masm.as_stwcx(newval, r0, scratch);
++    masm.ma_b(Assembler::NotEqual, &again);
++
++    masm.memoryBarrierAfter(sync);
++    masm.bind(&end);
++    // lwarx zero-extends; sign-extend for 32-bit canonical form.
++    masm.as_extsw(output, output);
++
++    return;
++  }
++
++  // Sub-word (1 or 2 byte) compare-exchange via native lbarx/lharx +
++  // stbcx./sthcx. POWER7+ (well below our POWER8 baseline). Replaces the prior
++  // round-down-to-word
++  // + mask + RMW dance. lXarx zero-extends the loaded byte/half; stXcx. stores
++  // only the low 8/16 bits of RS, so no pre-masking is needed on the store
++  // side. offsetTemp / maskTemp are still allocated by the lowering but unused
++  // here.
++  (void)offsetTemp;
++  (void)maskTemp;
++
++  masm.memoryBarrierBefore(sync);
++
++  masm.bind(&again);
++
++  if (access) {
++    masm.flushBuffer();  // see comment in wasmLoadImpl
++    masm.append(*access, wasm::TrapMachineInsn::Atomic,
++                FaultingCodeOffset(masm.currentOffset()));
++  }
++
++  switch (nbytes) {
++    case 1:
++      masm.as_lbarx(output, r0, scratch);
++      if (signExtend) {
++        masm.as_extsb(valueTemp, oldval);
++        masm.as_extsb(output, output);
++      } else {
++        masm.as_andi_rc(valueTemp, oldval, 0xff);
++      }
++      break;
++    case 2:
++      masm.as_lharx(output, r0, scratch);
++      if (signExtend) {
++        masm.as_extsh(valueTemp, oldval);
++        masm.as_extsh(output, output);
++      } else {
++        masm.as_rlwinm(valueTemp, oldval, 0, 16, 31);
++      }
++      break;
++  }
++
++  masm.ma_cmp(output, valueTemp, Assembler::NotEqual, /* is32bit */ true);
++  masm.ma_b(Assembler::NotEqual, &end);
++
++  if (nbytes == 1) {
++    masm.as_stbcx(newval, r0, scratch);
++  } else {
++    masm.as_sthcx(newval, r0, scratch);
++  }
++  masm.ma_b(Assembler::NotEqual, &again);
++
++  masm.memoryBarrierAfter(sync);
++
++  masm.bind(&end);
++}
++
++template <typename T>
++static void CompareExchange64(MacroAssembler& masm,
++                              const wasm::MemoryAccessDesc* access,
++                              Synchronization sync, const T& mem,
++                              Register64 expect, Register64 replace,
++                              Register64 output) {
++  MOZ_ASSERT(expect != output && replace != output);
++  UseScratchRegisterScope temps(masm);
++  Register scratch = temps.Acquire();
++  masm.computeEffectiveAddress(mem, scratch);
++
++  Label tryAgain;
++  Label exit;
++
++  masm.memoryBarrierBefore(sync);
++
++  masm.bind(&tryAgain);
++
++  if (access) {
++    masm.flushBuffer();  // see comment in wasmLoadImpl
++    masm.append(*access, wasm::TrapMachineInsn::Atomic,
++                FaultingCodeOffset(masm.currentOffset()));
++  }
++
++  masm.as_ldarx(output.reg, r0, scratch);
++
++  masm.ma_cmp(output.reg, expect.reg, Assembler::NotEqual);
++  masm.ma_b(Assembler::NotEqual, &exit);
++  masm.as_stdcx(replace.reg, r0, scratch);
++  masm.ma_b(Assembler::NotEqual, &tryAgain);
++
++  masm.memoryBarrierAfter(sync);
++
++  masm.bind(&exit);
++}
++
++template <typename T>
++static void AtomicExchange(MacroAssembler& masm,
++                           const wasm::MemoryAccessDesc* access,
++                           Scalar::Type type, Synchronization sync,
++                           const T& mem, Register value, Register valueTemp,
++                           Register offsetTemp, Register maskTemp,
++                           Register output) {
++  UseScratchRegisterScope temps(masm);
++  bool signExtend = Scalar::isSignedIntType(type);
++  unsigned nbytes = Scalar::byteSize(type);
++
++  switch (nbytes) {
++    case 1:
++    case 2:
++      break;
++    case 4:
++      MOZ_ASSERT(valueTemp == InvalidReg);
++      MOZ_ASSERT(offsetTemp == InvalidReg);
++      MOZ_ASSERT(maskTemp == InvalidReg);
++      break;
++    default:
++      MOZ_CRASH();
++  }
++
++  Label again;
++
++  Register memTemp = temps.Acquire();
++  masm.computeEffectiveAddress(mem, memTemp);
++
++  if (nbytes == 4) {
++    masm.memoryBarrierBefore(sync);
++    masm.bind(&again);
++
++    if (access) {
++      masm.flushBuffer();  // see comment in wasmLoadImpl
++      masm.append(*access, wasm::TrapMachineInsn::Atomic,
++                  FaultingCodeOffset(masm.currentOffset()));
++    }
++
++    masm.as_lwarx(output, r0, memTemp);
++    masm.as_stwcx(value, r0, memTemp);
++    masm.ma_b(Assembler::NotEqual, &again);
++
++    masm.memoryBarrierAfter(sync);
++    // lwarx zero-extends; sign-extend for 32-bit canonical form.
++    masm.as_extsw(output, output);
++
++    return;
++  }
++
++  // Sub-word exchange via native lbarx/lharx + stbcx./sthcx. (POWER7+).
++  // valueTemp / offsetTemp / maskTemp are still allocated by the lowering but
++  // unused here.
++  (void)valueTemp;
++  (void)offsetTemp;
++  (void)maskTemp;
++
++  masm.memoryBarrierBefore(sync);
++
++  masm.bind(&again);
++
++  if (access) {
++    masm.flushBuffer();  // see comment in wasmLoadImpl
++    masm.append(*access, wasm::TrapMachineInsn::Atomic,
++                FaultingCodeOffset(masm.currentOffset()));
++  }
++
++  if (nbytes == 1) {
++    masm.as_lbarx(output, r0, memTemp);
++    masm.as_stbcx(value, r0, memTemp);
++  } else {
++    masm.as_lharx(output, r0, memTemp);
++    masm.as_sthcx(value, r0, memTemp);
++  }
++  masm.ma_b(Assembler::NotEqual, &again);
++
++  if (signExtend) {
++    if (nbytes == 1) {
++      masm.as_extsb(output, output);
++    } else {
++      masm.as_extsh(output, output);
++    }
++  }
++  // Unsigned: lbarx/lharx already zero-extend; output is canonical.
++
++  masm.memoryBarrierAfter(sync);
++}
++
++template <typename T>
++static void AtomicExchange64(MacroAssembler& masm,
++                             const wasm::MemoryAccessDesc* access,
++                             Synchronization sync, const T& mem,
++                             Register64 value, Register64 output) {
++  MOZ_ASSERT(value != output);
++  UseScratchRegisterScope temps(masm);
++
++  Register scratch = temps.Acquire();
++  masm.computeEffectiveAddress(mem, scratch);
++
++  Label tryAgain;
++
++  masm.memoryBarrierBefore(sync);
++
++  masm.bind(&tryAgain);
++
++  if (access) {
++    masm.flushBuffer();  // see comment in wasmLoadImpl
++    masm.append(*access, wasm::TrapMachineInsn::Atomic,
++                FaultingCodeOffset(masm.currentOffset()));
++  }
++
++  masm.as_ldarx(output.reg, r0, scratch);
++
++  masm.as_stdcx(value.reg, r0, scratch);
++  masm.ma_b(Assembler::NotEqual, &tryAgain);
++
++  masm.memoryBarrierAfter(sync);
++}
++
++template <typename T>
++static void AtomicFetchOp(MacroAssembler& masm,
++                          const wasm::MemoryAccessDesc* access,
++                          Scalar::Type type, Synchronization sync, AtomicOp op,
++                          const T& mem, Register value, Register valueTemp,
++                          Register offsetTemp, Register maskTemp,
++                          Register output) {
++  UseScratchRegisterScope temps(masm);
++  bool signExtend = Scalar::isSignedIntType(type);
++  unsigned nbytes = Scalar::byteSize(type);
++
++  switch (nbytes) {
++    case 1:
++    case 2:
++      break;
++    case 4:
++      MOZ_ASSERT(valueTemp == InvalidReg);
++      MOZ_ASSERT(offsetTemp == InvalidReg);
++      MOZ_ASSERT(maskTemp == InvalidReg);
++      break;
++    default:
++      MOZ_CRASH();
++  }
++
++  Label again;
++
++  Register memTemp = temps.Acquire();
++  masm.computeEffectiveAddress(mem, memTemp);
++
++  Register scratch = temps.Acquire();
++
++  if (nbytes == 4) {
++    masm.memoryBarrierBefore(sync);
++    masm.bind(&again);
++
++    if (access) {
++      masm.flushBuffer();  // see comment in wasmLoadImpl
++      masm.append(*access, wasm::TrapMachineInsn::Atomic,
++                  FaultingCodeOffset(masm.currentOffset()));
++    }
++
++    masm.as_lwarx(output, r0, memTemp);
++
++    switch (op) {
++      case AtomicOp::Add:
++        masm.as_add(scratch, output, value);
++        break;
++      case AtomicOp::Sub:
++        masm.as_subf(scratch, value, output);
++        break;
++      case AtomicOp::And:
++        masm.as_and_(scratch, output, value);
++        break;
++      case AtomicOp::Or:
++        masm.as_or_(scratch, output, value);
++        break;
++      case AtomicOp::Xor:
++        masm.as_xor_(scratch, output, value);
++        break;
++      default:
++        MOZ_CRASH();
++    }
++
++    masm.as_stwcx(scratch, r0, memTemp);
++    masm.ma_b(Assembler::NotEqual, &again);
++
++    masm.memoryBarrierAfter(sync);
++    // lwarx zero-extends; sign-extend for 32-bit canonical form.
++    masm.as_extsw(output, output);
++
++    return;
++  }
++
++  // Sub-word fetch-and-op via native lbarx/lharx + stbcx./sthcx. (POWER7+).
++  // `output` holds the pre-op loaded value (returned to caller); `valueTemp`
++  // is the post-op value we condition-store. stXcx. only stores low 8/16 bits
++  // of RS, so no pre-mask of valueTemp is needed.
++  // offsetTemp / maskTemp are still allocated by the lowering but unused; the
++  // local `scratch` is only used in the 4-byte branch above.
++  (void)offsetTemp;
++  (void)maskTemp;
++
++  masm.memoryBarrierBefore(sync);
++
++  masm.bind(&again);
++
++  if (access) {
++    masm.flushBuffer();  // see comment in wasmLoadImpl
++    masm.append(*access, wasm::TrapMachineInsn::Atomic,
++                FaultingCodeOffset(masm.currentOffset()));
++  }
++
++  if (nbytes == 1) {
++    masm.as_lbarx(output, r0, memTemp);
++  } else {
++    masm.as_lharx(output, r0, memTemp);
++  }
++
++  switch (op) {
++    case AtomicOp::Add:
++      masm.as_add(valueTemp, output, value);
++      break;
++    case AtomicOp::Sub:
++      masm.as_subf(valueTemp, value, output);
++      break;
++    case AtomicOp::And:
++      masm.as_and_(valueTemp, output, value);
++      break;
++    case AtomicOp::Or:
++      masm.as_or_(valueTemp, output, value);
++      break;
++    case AtomicOp::Xor:
++      masm.as_xor_(valueTemp, output, value);
++      break;
++    default:
++      MOZ_CRASH();
++  }
++
++  if (nbytes == 1) {
++    masm.as_stbcx(valueTemp, r0, memTemp);
++  } else {
++    masm.as_sthcx(valueTemp, r0, memTemp);
++  }
++  masm.ma_b(Assembler::NotEqual, &again);
++
++  if (signExtend) {
++    if (nbytes == 1) {
++      masm.as_extsb(output, output);
++    } else {
++      masm.as_extsh(output, output);
++    }
++  }
++  // Unsigned: lbarx/lharx already zero-extend; output is canonical.
++
++  masm.memoryBarrierAfter(sync);
++}
++
++template <typename T>
++static void AtomicFetchOp64(MacroAssembler& masm,
++                            const wasm::MemoryAccessDesc* access,
++                            Synchronization sync, AtomicOp op, Register64 value,
++                            const T& mem, Register64 temp, Register64 output) {
++  MOZ_ASSERT(value != output);
++  MOZ_ASSERT(value != temp);
++  UseScratchRegisterScope temps(masm);
++  Register scratch = temps.Acquire();
++  masm.computeEffectiveAddress(mem, scratch);
++
++  Label tryAgain;
++
++  masm.memoryBarrierBefore(sync);
++
++  masm.bind(&tryAgain);
++
++  if (access) {
++    masm.flushBuffer();  // see comment in wasmLoadImpl
++    masm.append(*access, wasm::TrapMachineInsn::Atomic,
++                FaultingCodeOffset(masm.currentOffset()));
++  }
++
++  masm.as_ldarx(output.reg, r0, scratch);
++
++  switch (op) {
++    case AtomicOp::Add:
++      masm.as_add(temp.reg, output.reg, value.reg);
++      break;
++    case AtomicOp::Sub:
++      masm.as_subf(temp.reg, value.reg, output.reg);
++      break;
++    case AtomicOp::And:
++      masm.as_and_(temp.reg, output.reg, value.reg);
++      break;
++    case AtomicOp::Or:
++      masm.as_or_(temp.reg, output.reg, value.reg);
++      break;
++    case AtomicOp::Xor:
++      masm.as_xor_(temp.reg, output.reg, value.reg);
++      break;
++    default:
++      MOZ_CRASH();
++  }
++
++  masm.as_stdcx(temp.reg, r0, scratch);
++  masm.ma_b(Assembler::NotEqual, &tryAgain);
++
++  masm.memoryBarrierAfter(sync);
++}
++
++template <typename T>
++static void AtomicEffectOp(MacroAssembler& masm,
++                           const wasm::MemoryAccessDesc* access,
++                           Scalar::Type type, Synchronization sync, AtomicOp op,
++                           const T& mem, Register value, Register valueTemp,
++                           Register offsetTemp, Register maskTemp) {
++  UseScratchRegisterScope temps(masm);
++  unsigned nbytes = Scalar::byteSize(type);
++
++  switch (nbytes) {
++    case 1:
++    case 2:
++      break;
++    case 4:
++      MOZ_ASSERT(valueTemp == InvalidReg);
++      MOZ_ASSERT(offsetTemp == InvalidReg);
++      MOZ_ASSERT(maskTemp == InvalidReg);
++      break;
++    default:
++      MOZ_CRASH();
++  }
++
++  Label again;
++
++  Register scratch = temps.Acquire();
++  masm.computeEffectiveAddress(mem, scratch);
++
++  Register scratch2 = temps.Acquire();
++
++  if (nbytes == 4) {
++    masm.memoryBarrierBefore(sync);
++    masm.bind(&again);
++
++    if (access) {
++      masm.flushBuffer();  // see comment in wasmLoadImpl
++      masm.append(*access, wasm::TrapMachineInsn::Atomic,
++                  FaultingCodeOffset(masm.currentOffset()));
++    }
++
++    masm.as_lwarx(scratch2, r0, scratch);
++
++    switch (op) {
++      case AtomicOp::Add:
++        masm.as_add(scratch2, scratch2, value);
++        break;
++      case AtomicOp::Sub:
++        masm.as_subf(scratch2, value, scratch2);
++        break;
++      case AtomicOp::And:
++        masm.as_and_(scratch2, scratch2, value);
++        break;
++      case AtomicOp::Or:
++        masm.as_or_(scratch2, scratch2, value);
++        break;
++      case AtomicOp::Xor:
++        masm.as_xor_(scratch2, scratch2, value);
++        break;
++      default:
++        MOZ_CRASH();
++    }
++
++    masm.as_stwcx(scratch2, r0, scratch);
++    masm.ma_b(Assembler::NotEqual, &again);
++
++    masm.memoryBarrierAfter(sync);
++
++    return;
++  }
++
++  // Sub-word effect-only op via native lbarx/lharx + stbcx./sthcx. (POWER7+).
++  // No output to return; scratch2 holds the load+op+store value.
++  // valueTemp / offsetTemp / maskTemp are still allocated by the lowering but
++  // unused here.
++  (void)valueTemp;
++  (void)offsetTemp;
++  (void)maskTemp;
++
++  masm.memoryBarrierBefore(sync);
++
++  masm.bind(&again);
++
++  if (access) {
++    masm.flushBuffer();  // see comment in wasmLoadImpl
++    masm.append(*access, wasm::TrapMachineInsn::Atomic,
++                FaultingCodeOffset(masm.currentOffset()));
++  }
++
++  if (nbytes == 1) {
++    masm.as_lbarx(scratch2, r0, scratch);
++  } else {
++    masm.as_lharx(scratch2, r0, scratch);
++  }
++
++  switch (op) {
++    case AtomicOp::Add:
++      masm.as_add(scratch2, scratch2, value);
++      break;
++    case AtomicOp::Sub:
++      masm.as_subf(scratch2, value, scratch2);
++      break;
++    case AtomicOp::And:
++      masm.as_and_(scratch2, scratch2, value);
++      break;
++    case AtomicOp::Or:
++      masm.as_or_(scratch2, scratch2, value);
++      break;
++    case AtomicOp::Xor:
++      masm.as_xor_(scratch2, scratch2, value);
++      break;
++    default:
++      MOZ_CRASH();
++  }
++
++  if (nbytes == 1) {
++    masm.as_stbcx(scratch2, r0, scratch);
++  } else {
++    masm.as_sthcx(scratch2, r0, scratch);
++  }
++  masm.ma_b(Assembler::NotEqual, &again);
++
++  masm.memoryBarrierAfter(sync);
++}
++
++// Public MacroAssembler methods.
++
++void MacroAssembler::compareExchange(Scalar::Type type, Synchronization sync,
++                                     const Address& mem, Register oldval,
++                                     Register newval, Register valueTemp,
++                                     Register offsetTemp, Register maskTemp,
++                                     Register output) {
++  CompareExchange(*this, nullptr, type, sync, mem, oldval, newval, valueTemp,
++                  offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::compareExchange(Scalar::Type type, Synchronization sync,
++                                     const BaseIndex& mem, Register oldval,
++                                     Register newval, Register valueTemp,
++                                     Register offsetTemp, Register maskTemp,
++                                     Register output) {
++  CompareExchange(*this, nullptr, type, sync, mem, oldval, newval, valueTemp,
++                  offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::compareExchange64(Synchronization sync, const Address& mem,
++                                       Register64 expect, Register64 replace,
++                                       Register64 output) {
++  CompareExchange64(*this, nullptr, sync, mem, expect, replace, output);
++}
++
++void MacroAssembler::compareExchange64(Synchronization sync,
++                                       const BaseIndex& mem, Register64 expect,
++                                       Register64 replace, Register64 output) {
++  CompareExchange64(*this, nullptr, sync, mem, expect, replace, output);
++}
++
++void MacroAssembler::wasmCompareExchange(const wasm::MemoryAccessDesc& access,
++                                         const Address& mem, Register oldval,
++                                         Register newval, Register valueTemp,
++                                         Register offsetTemp, Register maskTemp,
++                                         Register output) {
++  CompareExchange(*this, &access, access.type(), access.sync(), mem, oldval,
++                  newval, valueTemp, offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::wasmCompareExchange(const wasm::MemoryAccessDesc& access,
++                                         const BaseIndex& mem, Register oldval,
++                                         Register newval, Register valueTemp,
++                                         Register offsetTemp, Register maskTemp,
++                                         Register output) {
++  CompareExchange(*this, &access, access.type(), access.sync(), mem, oldval,
++                  newval, valueTemp, offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::wasmCompareExchange64(const wasm::MemoryAccessDesc& access,
++                                           const Address& mem,
++                                           Register64 expect,
++                                           Register64 replace,
++                                           Register64 output) {
++  CompareExchange64(*this, &access, access.sync(), mem, expect, replace,
++                    output);
++}
++
++void MacroAssembler::wasmCompareExchange64(const wasm::MemoryAccessDesc& access,
++                                           const BaseIndex& mem,
++                                           Register64 expect,
++                                           Register64 replace,
++                                           Register64 output) {
++  CompareExchange64(*this, &access, access.sync(), mem, expect, replace,
++                    output);
++}
++
++void MacroAssembler::atomicExchange(Scalar::Type type, Synchronization sync,
++                                    const Address& mem, Register value,
++                                    Register valueTemp, Register offsetTemp,
++                                    Register maskTemp, Register output) {
++  AtomicExchange(*this, nullptr, type, sync, mem, value, valueTemp, offsetTemp,
++                 maskTemp, output);
++}
++
++void MacroAssembler::atomicExchange(Scalar::Type type, Synchronization sync,
++                                    const BaseIndex& mem, Register value,
++                                    Register valueTemp, Register offsetTemp,
++                                    Register maskTemp, Register output) {
++  AtomicExchange(*this, nullptr, type, sync, mem, value, valueTemp, offsetTemp,
++                 maskTemp, output);
++}
++
++void MacroAssembler::atomicExchange64(Synchronization sync, const Address& mem,
++                                      Register64 value, Register64 output) {
++  AtomicExchange64(*this, nullptr, sync, mem, value, output);
++}
++
++void MacroAssembler::atomicExchange64(Synchronization sync,
++                                      const BaseIndex& mem, Register64 value,
++                                      Register64 output) {
++  AtomicExchange64(*this, nullptr, sync, mem, value, output);
++}
++
++void MacroAssembler::wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
++                                        const Address& mem, Register value,
++                                        Register valueTemp, Register offsetTemp,
++                                        Register maskTemp, Register output) {
++  AtomicExchange(*this, &access, access.type(), access.sync(), mem, value,
++                 valueTemp, offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
++                                        const BaseIndex& mem, Register value,
++                                        Register valueTemp, Register offsetTemp,
++                                        Register maskTemp, Register output) {
++  AtomicExchange(*this, &access, access.type(), access.sync(), mem, value,
++                 valueTemp, offsetTemp, maskTemp, output);
++}
++
++template <typename T>
++static void WasmAtomicExchange64(MacroAssembler& masm,
++                                 const wasm::MemoryAccessDesc& access,
++                                 const T& mem, Register64 value,
++                                 Register64 output) {
++  AtomicExchange64(masm, &access, access.sync(), mem, value, output);
++}
++
++void MacroAssembler::wasmAtomicExchange64(const wasm::MemoryAccessDesc& access,
++                                          const Address& mem, Register64 src,
++                                          Register64 output) {
++  WasmAtomicExchange64(*this, access, mem, src, output);
++}
++
++void MacroAssembler::wasmAtomicExchange64(const wasm::MemoryAccessDesc& access,
++                                          const BaseIndex& mem, Register64 src,
++                                          Register64 output) {
++  WasmAtomicExchange64(*this, access, mem, src, output);
++}
++
++void MacroAssembler::atomicFetchOp(Scalar::Type type, Synchronization sync,
++                                   AtomicOp op, Register value,
++                                   const Address& mem, Register valueTemp,
++                                   Register offsetTemp, Register maskTemp,
++                                   Register output) {
++  AtomicFetchOp(*this, nullptr, type, sync, op, mem, value, valueTemp,
++                offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::atomicFetchOp(Scalar::Type type, Synchronization sync,
++                                   AtomicOp op, Register value,
++                                   const BaseIndex& mem, Register valueTemp,
++                                   Register offsetTemp, Register maskTemp,
++                                   Register output) {
++  AtomicFetchOp(*this, nullptr, type, sync, op, mem, value, valueTemp,
++                offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::atomicFetchOp64(Synchronization sync, AtomicOp op,
++                                     Register64 value, const Address& mem,
++                                     Register64 temp, Register64 output) {
++  AtomicFetchOp64(*this, nullptr, sync, op, value, mem, temp, output);
++}
++
++void MacroAssembler::atomicFetchOp64(Synchronization sync, AtomicOp op,
++                                     Register64 value, const BaseIndex& mem,
++                                     Register64 temp, Register64 output) {
++  AtomicFetchOp64(*this, nullptr, sync, op, value, mem, temp, output);
++}
++
++void MacroAssembler::atomicEffectOp64(Synchronization sync, AtomicOp op,
++                                      Register64 value, const Address& mem,
++                                      Register64 temp) {
++  AtomicFetchOp64(*this, nullptr, sync, op, value, mem, temp, temp);
++}
++
++void MacroAssembler::atomicEffectOp64(Synchronization sync, AtomicOp op,
++                                      Register64 value, const BaseIndex& mem,
++                                      Register64 temp) {
++  AtomicFetchOp64(*this, nullptr, sync, op, value, mem, temp, temp);
++}
++
++void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
++                                       AtomicOp op, Register value,
++                                       const Address& mem, Register valueTemp,
++                                       Register offsetTemp, Register maskTemp,
++                                       Register output) {
++  AtomicFetchOp(*this, &access, access.type(), access.sync(), op, mem, value,
++                valueTemp, offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
++                                       AtomicOp op, Register value,
++                                       const BaseIndex& mem, Register valueTemp,
++                                       Register offsetTemp, Register maskTemp,
++                                       Register output) {
++  AtomicFetchOp(*this, &access, access.type(), access.sync(), op, mem, value,
++                valueTemp, offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access,
++                                         AtomicOp op, Register64 value,
++                                         const Address& mem, Register64 temp,
++                                         Register64 output) {
++  AtomicFetchOp64(*this, &access, access.sync(), op, value, mem, temp, output);
++}
++
++void MacroAssembler::wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access,
++                                         AtomicOp op, Register64 value,
++                                         const BaseIndex& mem, Register64 temp,
++                                         Register64 output) {
++  AtomicFetchOp64(*this, &access, access.sync(), op, value, mem, temp, output);
++}
++
++void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
++                                        AtomicOp op, Register value,
++                                        const Address& mem, Register valueTemp,
++                                        Register offsetTemp,
++                                        Register maskTemp) {
++  AtomicEffectOp(*this, &access, access.type(), access.sync(), op, mem, value,
++                 valueTemp, offsetTemp, maskTemp);
++}
++
++void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
++                                        AtomicOp op, Register value,
++                                        const BaseIndex& mem,
++                                        Register valueTemp, Register offsetTemp,
++                                        Register maskTemp) {
++  AtomicEffectOp(*this, &access, access.type(), access.sync(), op, mem, value,
++                 valueTemp, offsetTemp, maskTemp);
++}
++
++// ========================================================================
++// JS atomic operations.
++
++template <typename T>
++static void CompareExchangeJS(MacroAssembler& masm, Scalar::Type arrayType,
++                              Synchronization sync, const T& mem,
++                              Register oldval, Register newval,
++                              Register valueTemp, Register offsetTemp,
++                              Register maskTemp, Register temp,
++                              AnyRegister output) {
++  if (arrayType == Scalar::Uint32) {
++    masm.compareExchange(arrayType, sync, mem, oldval, newval, valueTemp,
++                         offsetTemp, maskTemp, temp);
++    masm.convertUInt32ToDouble(temp, output.fpu());
++  } else {
++    masm.compareExchange(arrayType, sync, mem, oldval, newval, valueTemp,
++                         offsetTemp, maskTemp, output.gpr());
++  }
++}
++
++template <typename T>
++static void AtomicExchangeJS(MacroAssembler& masm, Scalar::Type arrayType,
++                             Synchronization sync, const T& mem, Register value,
++                             Register valueTemp, Register offsetTemp,
++                             Register maskTemp, Register temp,
++                             AnyRegister output) {
++  if (arrayType == Scalar::Uint32) {
++    masm.atomicExchange(arrayType, sync, mem, value, valueTemp, offsetTemp,
++                        maskTemp, temp);
++    masm.convertUInt32ToDouble(temp, output.fpu());
++  } else {
++    masm.atomicExchange(arrayType, sync, mem, value, valueTemp, offsetTemp,
++                        maskTemp, output.gpr());
++  }
++}
++
++template <typename T>
++static void AtomicFetchOpJS(MacroAssembler& masm, Scalar::Type arrayType,
++                            Synchronization sync, AtomicOp op, Register value,
++                            const T& mem, Register valueTemp,
++                            Register offsetTemp, Register maskTemp,
++                            Register temp, AnyRegister output) {
++  if (arrayType == Scalar::Uint32) {
++    masm.atomicFetchOp(arrayType, sync, op, value, mem, valueTemp, offsetTemp,
++                       maskTemp, temp);
++    masm.convertUInt32ToDouble(temp, output.fpu());
++  } else {
++    masm.atomicFetchOp(arrayType, sync, op, value, mem, valueTemp, offsetTemp,
++                       maskTemp, output.gpr());
++  }
++}
++
++void MacroAssembler::compareExchangeJS(Scalar::Type arrayType,
++                                       Synchronization sync, const Address& mem,
++                                       Register oldval, Register newval,
++                                       Register valueTemp, Register offsetTemp,
++                                       Register maskTemp, Register temp,
++                                       AnyRegister output) {
++  CompareExchangeJS(*this, arrayType, sync, mem, oldval, newval, valueTemp,
++                    offsetTemp, maskTemp, temp, output);
++}
++
++void MacroAssembler::compareExchangeJS(Scalar::Type arrayType,
++                                       Synchronization sync,
++                                       const BaseIndex& mem, Register oldval,
++                                       Register newval, Register valueTemp,
++                                       Register offsetTemp, Register maskTemp,
++                                       Register temp, AnyRegister output) {
++  CompareExchangeJS(*this, arrayType, sync, mem, oldval, newval, valueTemp,
++                    offsetTemp, maskTemp, temp, output);
++}
++
++void MacroAssembler::atomicExchangeJS(Scalar::Type arrayType,
++                                      Synchronization sync, const Address& mem,
++                                      Register value, Register valueTemp,
++                                      Register offsetTemp, Register maskTemp,
++                                      Register temp, AnyRegister output) {
++  AtomicExchangeJS(*this, arrayType, sync, mem, value, valueTemp, offsetTemp,
++                   maskTemp, temp, output);
++}
++
++void MacroAssembler::atomicExchangeJS(Scalar::Type arrayType,
++                                      Synchronization sync,
++                                      const BaseIndex& mem, Register value,
++                                      Register valueTemp, Register offsetTemp,
++                                      Register maskTemp, Register temp,
++                                      AnyRegister output) {
++  AtomicExchangeJS(*this, arrayType, sync, mem, value, valueTemp, offsetTemp,
++                   maskTemp, temp, output);
++}
++
++void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
++                                     Synchronization sync, AtomicOp op,
++                                     Register value, const Address& mem,
++                                     Register valueTemp, Register offsetTemp,
++                                     Register maskTemp, Register temp,
++                                     AnyRegister output) {
++  AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, valueTemp, offsetTemp,
++                  maskTemp, temp, output);
++}
++
++void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
++                                     Synchronization sync, AtomicOp op,
++                                     Register value, const BaseIndex& mem,
++                                     Register valueTemp, Register offsetTemp,
++                                     Register maskTemp, Register temp,
++                                     AnyRegister output) {
++  AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, valueTemp, offsetTemp,
++                  maskTemp, temp, output);
++}
++
++void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType,
++                                      Synchronization sync, AtomicOp op,
++                                      Register value, const BaseIndex& mem,
++                                      Register valueTemp, Register offsetTemp,
++                                      Register maskTemp) {
++  AtomicEffectOp(*this, nullptr, arrayType, sync, op, mem, value, valueTemp,
++                 offsetTemp, maskTemp);
++}
++
++void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType,
++                                      Synchronization sync, AtomicOp op,
++                                      Register value, const Address& mem,
++                                      Register valueTemp, Register offsetTemp,
++                                      Register maskTemp) {
++  AtomicEffectOp(*this, nullptr, arrayType, sync, op, mem, value, valueTemp,
++                 offsetTemp, maskTemp);
++}
++
++// ========================================================================
++// Wasm address offset carry tests.
++
++void MacroAssemblerPPC64Compat::ma_add32TestCarry(Condition cond, Register rd,
++                                                  Register rs, Imm32 imm,
++                                                  Label* overflow) {
++  MOZ_ASSERT(cond == Assembler::CarrySet || cond == Assembler::CarryClear);
++  if (rd != rs) {
++    asMasm().move32(rs, rd);
++    asMasm().add32(imm, rd);
++    as_cmplw(rd, rs);
++  } else {
++    // visitWasmAddOffset uses useRegisterAtStart, so the LIR allocator may
++    // collapse rd onto rs. move32 + add32 would clobber rs before the
++    // compare; save rs to a scratch first.
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    asMasm().move32(rs, scratch);
++    asMasm().add32(imm, rd);
++    as_cmplw(rd, scratch);
++  }
++  ma_b(cond == Assembler::CarrySet ? LessThan : GreaterThanOrEqual, overflow);
++}
++
++void MacroAssemblerPPC64Compat::ma_addPtrTestCarry(Condition cond, Register rd,
++                                                   Register rs, ImmWord imm,
++                                                   Label* overflow) {
++  MOZ_ASSERT(cond == Assembler::CarrySet || cond == Assembler::CarryClear);
++  if (rd != rs) {
++    asMasm().movePtr(rs, rd);
++    asMasm().addPtr(ImmWord(imm.value), rd);
++    as_cmpld(rd, rs);
++  } else {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    asMasm().movePtr(rs, scratch);
++    asMasm().addPtr(ImmWord(imm.value), rd);
++    as_cmpld(rd, scratch);
++  }
++  ma_b(cond == Assembler::CarrySet ? LessThan : GreaterThanOrEqual, overflow);
++}
++
++// ========================================================================
++// Wasm load/store helpers.
++
++void MacroAssemblerPPC64Compat::wasmProbeLastByte(
++    const wasm::MemoryAccessDesc& access, Register memoryBase, Register ptr) {
++  if (HasPOWER9()) {
++    return;
++  }
++  const unsigned size = Scalar::byteSize(access.type());
++  if (size <= 1) {
++    return;
++  }
++  UseScratchRegisterScope temps(asMasm());
++  Register probeAddr = temps.Acquire();
++  // size is at most 16 (Simd128), well within the int16_t range of as_addi.
++  as_addi(probeAddr, ptr, int16_t(size - 1));
++  // Record the probe as a wasm trap site so its SIGSEGV dispatches
++  // through the wasm signal handler the same way the real access would.
++  m_buffer.flushPool();
++  append(access, wasm::TrapMachineInsn::Load8,
++         FaultingCodeOffset(currentOffset()));
++  // Probing 1-byte load; result discarded.
++  as_lbzx(probeAddr, memoryBase, probeAddr);
++}
++
++void MacroAssemblerPPC64Compat::wasmLoadImpl(
++    const wasm::MemoryAccessDesc& access, Register memoryBase, Register ptr,
++    Register ptrScratch, AnyRegister output) {
++  access.assertOffsetInGuardPages();
++  uint32_t offset = access.offset32();
++  MOZ_ASSERT_IF(offset, ptrScratch != InvalidReg);
++
++  if (offset) {
++    asMasm().addPtr(ImmWord(offset), ptrScratch);
++    ptr = ptrScratch;
++  }
++
++  wasmProbeLastByte(access, memoryBase, ptr);
++
++  asMasm().memoryBarrierBefore(access.sync());
++  // Flush any pending constant pool entries before recording the trap site,
++  // otherwise a pool body inserted between the recorded offset and the
++  // emitted load shifts the load and leaves the pool guard branch at the
++  // recorded offset (SummarizeTrapInstruction then rejects the trap site).
++  m_buffer.flushPool();
++  append(access, wasm::TrapMachineInsnForLoad(Scalar::byteSize(access.type())),
++         FaultingCodeOffset(currentOffset()));
++
++  switch (access.type()) {
++    case Scalar::Int8:
++      as_lbzx(output.gpr(), memoryBase, ptr);
++      as_extsb(output.gpr(), output.gpr());
++      break;
++    case Scalar::Uint8:
++      as_lbzx(output.gpr(), memoryBase, ptr);
++      break;
++    case Scalar::Int16:
++      as_lhax(output.gpr(), memoryBase, ptr);
++      break;
++    case Scalar::Uint16:
++      as_lhzx(output.gpr(), memoryBase, ptr);
++      break;
++    case Scalar::Int32:
++    case Scalar::Uint32:
++      as_lwzx(output.gpr(), memoryBase, ptr);
++      as_extsw(output.gpr(), output.gpr());
++      break;
++    case Scalar::Float64:
++      if (access.isZeroExtendSimd128Load() || access.isSplatSimd128Load() ||
++          access.isWidenSimd128Load()) {
++        // lfdx is X-form scalar FP — encodes only 5-bit FRT, so a
++        // Simd128 dest (encoding 32+) corrupts the opcode. Bridge
++        // through ScratchDoubleReg (FPR f0, encoding 0).
++        ScratchDoubleScope dscratch(asMasm());
++        as_lfdx(dscratch, memoryBase, ptr);
++        if (access.isZeroExtendSimd128Load()) {
++          // Loaded value goes to BE dw1 (= LE dw0 = lane 0); BE dw0 = 0.
++          as_xxlxor(ScratchSimd128Reg, ScratchSimd128Reg, ScratchSimd128Reg);
++          as_xxpermdi(output.fpu(), ScratchSimd128Reg, dscratch, 0);
++        } else if (access.isSplatSimd128Load()) {
++          as_xxpermdi(output.fpu(), dscratch, dscratch, 0);
++        } else {
++          // widen: place loaded 64 bits in LE dw0 (= BE dw1) for widenLow.
++          as_xxpermdi(output.fpu(), dscratch, dscratch, 2);
++          switch (access.widenSimdOp()) {
++            case wasm::SimdOp::V128Load8x8S:
++              asMasm().widenLowInt8x16(output.fpu(), output.fpu());
++              break;
++            case wasm::SimdOp::V128Load8x8U:
++              asMasm().unsignedWidenLowInt8x16(output.fpu(), output.fpu());
++              break;
++            case wasm::SimdOp::V128Load16x4S:
++              asMasm().widenLowInt16x8(output.fpu(), output.fpu());
++              break;
++            case wasm::SimdOp::V128Load16x4U:
++              asMasm().unsignedWidenLowInt16x8(output.fpu(), output.fpu());
++              break;
++            case wasm::SimdOp::V128Load32x2S:
++              asMasm().widenLowInt32x4(output.fpu(), output.fpu());
++              break;
++            case wasm::SimdOp::V128Load32x2U:
++              asMasm().unsignedWidenLowInt32x4(output.fpu(), output.fpu());
++              break;
++            default:
++              MOZ_CRASH("Unexpected widen op");
++          }
++        }
++      } else {
++        as_lfdx(output.fpu(), memoryBase, ptr);
++      }
++      break;
++    case Scalar::Float32:
++      if (access.isZeroExtendSimd128Load()) {
++        // v128.load32_zero: load 32 raw bits into lane 0, zero the rest.
++        UseScratchRegisterScope temps(asMasm());
++        Register tmp = temps.Acquire();
++        as_lwzx(tmp, memoryBase, ptr);
++        as_xxlxor(output.fpu(), output.fpu(), output.fpu());
++        if (HasPOWER9()) {
++          as_mtvsrws(ScratchSimd128Reg, tmp);
++          as_xxinsertw(output.fpu(), ScratchSimd128Reg, 12);
++        } else {
++          // POWER8: mtvsrd puts value in BE dw0 low 32 bits.
++          // xxpermdi(dest, zero, scratch, 0) = {zero[dw0], scratch[dw0]}
++          // in BE, placing the value in LE word 0 with the rest zero.
++          as_mtvsrd(ScratchSimd128Reg, tmp);
++          as_xxpermdi(output.fpu(), output.fpu(), ScratchSimd128Reg, 0);
++        }
++      } else {
++        as_lfsx(output.fpu(), memoryBase, ptr);
++      }
++      break;
++    case Scalar::Simd128:
++      if (HasPOWER9()) {
++        as_lxvx(output.fpu(), memoryBase, ptr);
++      } else {
++        as_lxvd2x(output.fpu(), memoryBase, ptr);
++        as_xxpermdi(output.fpu(), output.fpu(), output.fpu(), 2);
++      }
++      break;
++    default:
++      MOZ_CRASH("unexpected array type");
++  }
++
++  asMasm().memoryBarrierAfter(access.sync());
++}
++
++void MacroAssemblerPPC64Compat::wasmStoreImpl(
++    const wasm::MemoryAccessDesc& access, AnyRegister value,
++    Register memoryBase, Register ptr, Register ptrScratch) {
++  access.assertOffsetInGuardPages();
++  uint32_t offset = access.offset32();
++  MOZ_ASSERT_IF(offset, ptrScratch != InvalidReg);
++
++  if (offset) {
++    asMasm().addPtr(ImmWord(offset), ptrScratch);
++    ptr = ptrScratch;
++  }
++
++  wasmProbeLastByte(access, memoryBase, ptr);
++
++  asMasm().memoryBarrierBefore(access.sync());
++  // Record trap site at the faulting memory instruction. For P8 Simd128
++  // store, the faulting instruction (stxvd2x) is after a byte-swap
++  // (xxpermdi), so we defer the trap site recording.
++  // Flush pool first; see comment in wasmLoadImpl.
++  if (access.type() != Scalar::Simd128 || HasPOWER9()) {
++    m_buffer.flushPool();
++    append(access,
++           wasm::TrapMachineInsnForStore(Scalar::byteSize(access.type())),
++           FaultingCodeOffset(currentOffset()));
++  }
++
++  switch (access.type()) {
++    case Scalar::Int8:
++    case Scalar::Uint8:
++      as_stbx(value.gpr(), memoryBase, ptr);
++      break;
++    case Scalar::Int16:
++    case Scalar::Uint16:
++      as_sthx(value.gpr(), memoryBase, ptr);
++      break;
++    case Scalar::Int32:
++    case Scalar::Uint32:
++      as_stwx(value.gpr(), memoryBase, ptr);
++      break;
++    case Scalar::Int64:
++      as_stdx(value.gpr(), memoryBase, ptr);
++      break;
++    case Scalar::Float64:
++      as_stfdx(value.fpu(), memoryBase, ptr);
++      break;
++    case Scalar::Float32:
++      as_stfsx(value.fpu(), memoryBase, ptr);
++      break;
++    case Scalar::Simd128:
++      if (HasPOWER9()) {
++        as_stxvx(value.fpu(), memoryBase, ptr);
++      } else {
++        as_xxpermdi(ScratchSimd128Reg, value.fpu(), value.fpu(), 2);
++        m_buffer.flushPool();  // see comment in wasmLoadImpl
++        append(access,
++               wasm::TrapMachineInsnForStore(Scalar::byteSize(access.type())),
++               FaultingCodeOffset(currentOffset()));
++        as_stxvd2x(ScratchSimd128Reg, memoryBase, ptr);
++      }
++      break;
++    default:
++      MOZ_CRASH("unexpected array type");
++  }
++
++  asMasm().memoryBarrierAfter(access.sync());
++}
++
++void MacroAssemblerPPC64Compat::wasmLoadI64Impl(
++    const wasm::MemoryAccessDesc& access, Register memoryBase, Register ptr,
++    Register ptrScratch, Register64 output) {
++  uint32_t offset = access.offset32();
++  MOZ_ASSERT_IF(offset, ptrScratch != InvalidReg);
++
++  if (offset) {
++    asMasm().addPtr(ImmWord(offset), ptrScratch);
++    ptr = ptrScratch;
++  }
++
++  wasmProbeLastByte(access, memoryBase, ptr);
++
++  asMasm().memoryBarrierBefore(access.sync());
++  m_buffer.flushPool();  // see comment in wasmLoadImpl
++  append(access, wasm::TrapMachineInsnForLoad(Scalar::byteSize(access.type())),
++         FaultingCodeOffset(currentOffset()));
++
++  switch (access.type()) {
++    case Scalar::Int8:
++      as_lbzx(output.reg, memoryBase, ptr);
++      as_extsb(output.reg, output.reg);
++      break;
++    case Scalar::Uint8:
++      as_lbzx(output.reg, memoryBase, ptr);
++      break;
++    case Scalar::Int16:
++      as_lhax(output.reg, memoryBase, ptr);
++      break;
++    case Scalar::Uint16:
++      as_lhzx(output.reg, memoryBase, ptr);
++      break;
++    case Scalar::Int32:
++      as_lwzx(output.reg, memoryBase, ptr);
++      as_extsw(output.reg, output.reg);
++      break;
++    case Scalar::Uint32:
++      as_lwzx(output.reg, memoryBase, ptr);
++      // Zero-extended by lwzx already
++      break;
++    case Scalar::Int64:
++      as_ldx(output.reg, memoryBase, ptr);
++      break;
++    default:
++      MOZ_CRASH("unexpected array type");
++  }
++
++  asMasm().memoryBarrierAfter(access.sync());
++}
++
++void MacroAssemblerPPC64Compat::wasmStoreI64Impl(
++    const wasm::MemoryAccessDesc& access, Register64 value, Register memoryBase,
++    Register ptr, Register ptrScratch) {
++  uint32_t offset = access.offset32();
++  MOZ_ASSERT_IF(offset, ptrScratch != InvalidReg);
++
++  if (offset) {
++    asMasm().addPtr(ImmWord(offset), ptrScratch);
++    ptr = ptrScratch;
++  }
++
++  wasmProbeLastByte(access, memoryBase, ptr);
++
++  asMasm().memoryBarrierBefore(access.sync());
++  m_buffer.flushPool();  // see comment in wasmLoadImpl
++  append(access, wasm::TrapMachineInsnForStore(Scalar::byteSize(access.type())),
++         FaultingCodeOffset(currentOffset()));
++
++  switch (access.type()) {
++    case Scalar::Int8:
++    case Scalar::Uint8:
++      as_stbx(value.reg, memoryBase, ptr);
++      break;
++    case Scalar::Int16:
++    case Scalar::Uint16:
++      as_sthx(value.reg, memoryBase, ptr);
++      break;
++    case Scalar::Int32:
++    case Scalar::Uint32:
++      as_stwx(value.reg, memoryBase, ptr);
++      break;
++    case Scalar::Int64:
++      as_stdx(value.reg, memoryBase, ptr);
++      break;
++    default:
++      MOZ_CRASH("unexpected array type");
++  }
++
++  asMasm().memoryBarrierAfter(access.sync());
++}
++
++void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
++                              Register memoryBase, Register ptr,
++                              Register ptrScratch, AnyRegister output) {
++  wasmLoadImpl(access, memoryBase, ptr, ptrScratch, output);
++}
++
++void MacroAssembler::wasmLoadI64(const wasm::MemoryAccessDesc& access,
++                                 Register memoryBase, Register ptr,
++                                 Register ptrScratch, Register64 output) {
++  wasmLoadI64Impl(access, memoryBase, ptr, ptrScratch, output);
++}
++
++void MacroAssembler::wasmStore(const wasm::MemoryAccessDesc& access,
++                               AnyRegister value, Register memoryBase,
++                               Register ptr, Register ptrScratch) {
++  wasmStoreImpl(access, value, memoryBase, ptr, ptrScratch);
++}
++
++void MacroAssembler::wasmStoreI64(const wasm::MemoryAccessDesc& access,
++                                  Register64 value, Register memoryBase,
++                                  Register ptr, Register ptrScratch) {
++  wasmStoreI64Impl(access, value, memoryBase, ptr, ptrScratch);
++}
++
++//}}} check_macroassembler_style
++
++}  // namespace jit
++}  // namespace js
++
++#ifdef ENABLE_WASM_SIMD
++// static
++bool MacroAssembler::MustMaskShiftCountSimd128(wasm::SimdOp op, int32_t* mask) {
++  return false;
++}
++#endif
+diff --git a/js/src/jit/ppc64/MacroAssembler-ppc64.h b/js/src/jit/ppc64/MacroAssembler-ppc64.h
+new file mode 100644
+index 000000000000..bc2143b67465
+--- /dev/null
++++ b/js/src/jit/ppc64/MacroAssembler-ppc64.h
+@@ -0,0 +1,2031 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_MacroAssembler_ppc64_h
++#define jit_ppc64_MacroAssembler_ppc64_h
++
++#include "jit/MoveResolver.h"
++#include "jit/ppc64/Assembler-ppc64.h"
++#include "wasm/WasmBuiltins.h"
++
++namespace js {
++namespace jit {
++
++inline bool is_intN(int64_t x, unsigned n) {
++  MOZ_ASSERT((0 < n) && (n < 64));
++  int64_t limit = static_cast<int64_t>(1) << (n - 1);
++  return (-limit <= x) && (x < limit);
++}
++
++inline bool is_uintN(uint64_t x, unsigned n) {
++  MOZ_ASSERT((0 < n) && (n < 64));
++  return !(x >> n);
++}
++
++// enterNoPool() guard sizes. Inhibiting the constant pool keeps these
++// stanzas at a fixed instruction count, which patchers and long-branch
++// resolvers rely on. Each constant names a distinct stanza shape; see
++// the emitting call site for the exact layout.
++//
++// kNoPoolLoad64StanzaInsns (8): emitLoad64Stanza body — 8 NOPs that
++//   WriteLoad64Instructions later overwrites in place. Two shapes share
++//   the same 8-slot footprint with the .quad fixed at slots [6..7]:
++//     - POWER9+ (HasPOWER9()): addpcis + ld + b + 3 NOPs (2 dynamic insns,
++//       no LR clobber). Preferred path.
++//     - POWER8 fallback: mflr/bcl/mflr/mtlr/ld/b LR-bouncing sequence
++//       (6 dynamic insns, RAS-thrashing — kept only because P8 has no
++//       addpcis).
++//
++// kNoPoolPatchableBranchInsns (10): patchable far call / jump /
++//   unconditional branch. Three alternative shapes, all fitting the
++//   same budget:
++//     - load64 stanza (8) + mtctr + bctr[l]  = 10  (bound call/jump)
++//     - 9 NOPs + bl                          = 10  (short bound call)
++//     - xs_trap_tagged(TAG) + chain + 8 NOPs = 10  (fwd-ref stanza)
++//
++// kNoPoolCondLongBranchInsnsP8Max (14): conditional long branch, POWER8
++//   Overflow worst case. POWER8 has no mcrxrx so overflow/carry test is
++//   mfxer+rlwinm+mtcrf (3 insns) on top of the base shape. Budget =
++//   3 (XER inspection) + 1 (bc) + 8 (load64 stanza) + 2 (mtctr+bctr) = 14.
++static constexpr size_t kNoPoolLoad64StanzaInsns = 8;
++static constexpr size_t kNoPoolPatchableBranchInsns = 10;
++static constexpr size_t kNoPoolCondLongBranchInsnsP8Max = 14;
++
++enum LoadStoreSize {
++  SizeByte = 8,
++  SizeHalfWord = 16,
++  SizeWord = 32,
++  SizeDouble = 64
++};
++
++enum LoadStoreExtension { ZeroExtend = 0, SignExtend = 1 };
++
++static Register CallReg = r12;
++
++struct ImmShiftedTag : public ImmWord {
++  explicit ImmShiftedTag(JSValueShiftedTag shtag) : ImmWord((uintptr_t)shtag) {}
++  explicit ImmShiftedTag(JSValueType type)
++      : ImmWord(((uintptr_t)JSVAL_TYPE_TO_SHIFTED_TAG(type))) {}
++};
++
++struct ImmTag : public Imm32 {
++  explicit ImmTag(JSValueTag tag) : Imm32(tag) {}
++};
++
++class ScratchTagScope {
++  UseScratchRegisterScope temps_;
++  Register scratch_;
++  bool owned_;
++  mozilla::DebugOnly<bool> released_;
++
++ public:
++  ScratchTagScope(Assembler& masm, const ValueOperand&)
++      : temps_(masm), owned_(true), released_(false) {
++    scratch_ = temps_.Acquire();
++  }
++
++  operator Register() {
++    MOZ_ASSERT(!released_);
++    return scratch_;
++  }
++
++  void release() {
++    MOZ_ASSERT(!released_);
++    released_ = true;
++    if (owned_) {
++      temps_.Release(scratch_);
++      owned_ = false;
++    }
++  }
++
++  void reacquire() {
++    MOZ_ASSERT(released_);
++    released_ = false;
++    if (!owned_) {
++      scratch_ = temps_.Acquire();
++      owned_ = true;
++    }
++  }
++};
++
++class ScratchTagScopeRelease {
++  ScratchTagScope* ts_;
++
++ public:
++  explicit ScratchTagScopeRelease(ScratchTagScope* ts) : ts_(ts) {
++    ts_->release();
++  }
++  ~ScratchTagScopeRelease() { ts_->reacquire(); }
++};
++
++class MacroAssemblerPPC64 : public Assembler {
++ protected:
++  MacroAssembler& asMasm();
++  const MacroAssembler& asMasm() const;
++};
++
++class MacroAssemblerPPC64Compat : public MacroAssemblerPPC64 {
++ public:
++  using MacroAssemblerPPC64::MacroAssemblerPPC64;
++
++  MacroAssemblerPPC64Compat() {}
++
++  bool buildOOLFakeExitFrame(void* fakeReturnAddr);
++
++  // ===============================================================
++  // Conversion functions
++
++  void convertBoolToInt32(Register src, Register dest) {
++    as_rlwinm(dest, src, 0, 31, 31);
++  }
++  void convertInt32ToDouble(Register src, FloatRegister dest) {
++    // mtvsrwa: VSR[dest].dw0 = sign_ext_64(src[32:63]); P8+ (ISA 2.07).
++    // Replaces extsw + mtvsrd (2 insns + scratch GPR) with 1 insn.
++    as_mtvsrwa(dest, src);
++    as_fcfid(dest, dest);
++  }
++  void convertInt32ToDouble(const Address& src, FloatRegister dest) {
++    // lfiwax (P7+): FPR.dw[0] = sign_ext_64(MEM[addr, 4]). X-form indexed
++    // — no immediate offset, so when offset != 0 we add it into a scratch
++    // first. Replaces lwz + extsw + mtvsrd with lfiwax (one insn) plus
++    // optional address add.
++    if (src.offset == 0) {
++      as_lfiwax(dest, r0, src.base);
++    } else {
++      UseScratchRegisterScope temps(*this);
++      Register scratch = temps.Acquire();
++      if (is_intN(src.offset, 16)) {
++        as_addi(scratch, src.base, src.offset);
++        as_lfiwax(dest, r0, scratch);
++      } else {
++        // X-form indexed: lfiwax computes base + scratch directly, no add.
++        movePtr(ImmWord(src.offset), scratch);
++        as_lfiwax(dest, src.base, scratch);
++      }
++    }
++    as_fcfid(dest, dest);
++  }
++  void convertInt32ToDouble(const BaseIndex& src, FloatRegister dest) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    computeScaledAddress(src, scratch);
++    convertInt32ToDouble(Address(scratch, src.offset), dest);
++  }
++  void convertUInt32ToDouble(Register src, FloatRegister dest);
++  void convertUInt32ToFloat32(Register src, FloatRegister dest);
++  void convertDoubleToFloat32(FloatRegister src, FloatRegister dest) {
++    as_frsp(dest, src);
++  }
++  // POWER9 FP16 conversions (1 insn each). Caller must have verified
++  // HasPOWER9() — SupportsFloat{64,32}To16 gates that. PPC64 FPRs hold
++  // doubles internally; an "FP32-in-FPR" is just the FP32 value stored
++  // as exact FP64, so xscvdphp/xscvhpdp work for both FP32↔FP16 and
++  // FP64↔FP16 (FP16 fits exactly in FP32 which fits exactly in FP64).
++  void convertDoubleToFloat16(FloatRegister src, FloatRegister dest) {
++    MOZ_ASSERT(HasPOWER9());
++    as_xscvdphp(dest, src);
++  }
++  void convertFloat16ToDouble(FloatRegister src, FloatRegister dest) {
++    MOZ_ASSERT(HasPOWER9());
++    as_xscvhpdp(dest, src);
++  }
++  void convertFloat32ToFloat16(FloatRegister src, FloatRegister dest) {
++    MOZ_ASSERT(HasPOWER9());
++    as_xscvdphp(dest, src);
++  }
++  void convertFloat16ToFloat32(FloatRegister src, FloatRegister dest) {
++    MOZ_ASSERT(HasPOWER9());
++    as_xscvhpdp(dest, src);
++  }
++  void convertInt32ToFloat16(Register src, FloatRegister dest) {
++    MOZ_ASSERT(HasPOWER9());
++    convertInt32ToFloat32(src, dest);
++    convertFloat32ToFloat16(dest, dest);
++  }
++  void convertDoubleToInt32(FloatRegister src, Register dest, Label* fail,
++                            bool negativeZeroCheck = true);
++  void convertDoubleToPtr(FloatRegister src, Register dest, Label* fail,
++                          bool negativeZeroCheck = true);
++  void convertFloat32ToInt32(FloatRegister src, Register dest, Label* fail,
++                             bool negativeZeroCheck = true);
++  void convertFloat32ToDouble(FloatRegister src, FloatRegister dest) {
++    // PPC64 FPRs hold every FP32 value in its FP64-equivalent representation,
++    // so f64.promote_f32 is conceptually a no-op except that wasm requires
++    // sNaN inputs to be quieted. frsp (Round to Single-Precision) is the
++    // identity for SP-representable inputs but applies IEEE NaN-quieting as
++    // a side effect, replacing the prior fmr + fcmpu + branch + canonical-
++    // NaN-load (5+ insns + scratch GPR) with a single instruction. Result
++    // matches what x86 vcvtss2sd / ARM fcvt produce.
++    as_frsp(dest, src);
++  }
++  void convertInt32ToFloat32(Register src, FloatRegister dest) {
++    // mtvsrwa + fcfids; same recipe as convertInt32ToDouble(Register).
++    as_mtvsrwa(dest, src);
++    as_fcfids(dest, dest);
++  }
++  void convertInt32ToFloat32(const Address& src, FloatRegister dest) {
++    // lfiwax + fcfids; same recipe as convertInt32ToDouble(Address).
++    if (src.offset == 0) {
++      as_lfiwax(dest, r0, src.base);
++    } else {
++      UseScratchRegisterScope temps(*this);
++      Register scratch = temps.Acquire();
++      if (is_intN(src.offset, 16)) {
++        as_addi(scratch, src.base, src.offset);
++        as_lfiwax(dest, r0, scratch);
++      } else {
++        movePtr(ImmWord(src.offset), scratch);
++        as_lfiwax(dest, src.base, scratch);
++      }
++    }
++    as_fcfids(dest, dest);
++  }
++
++  // POWER9 FP16 load: lxsihzx writes the 2 memory bytes directly into
++  // dw[0] low 16 bits with the rest zeroed — matching the layout that
++  // xscvhpdp expects, in a single instruction.
++  FaultingCodeOffset loadFloat16(const Address& addr, FloatRegister dest,
++                                 Register temp) {
++    MOZ_ASSERT(HasPOWER9());
++    if (addr.offset == 0) {
++      return FaultingCodeOffset(as_lxsihzx(dest, r0, addr.base).getOffset());
++    }
++    if (is_intN(addr.offset, 16)) {
++      as_addi(temp, addr.base, addr.offset);
++      return FaultingCodeOffset(as_lxsihzx(dest, r0, temp).getOffset());
++    }
++    movePtr(ImmWord(addr.offset), temp);
++    return FaultingCodeOffset(as_lxsihzx(dest, addr.base, temp).getOffset());
++  }
++  FaultingCodeOffset loadFloat16(const BaseIndex& src, FloatRegister dest,
++                                 Register temp) {
++    MOZ_ASSERT(HasPOWER9());
++    computeEffectiveAddress(src, temp);
++    return FaultingCodeOffset(as_lxsihzx(dest, r0, temp).getOffset());
++  }
++
++  // ===============================================================
++  // Effective address computation
++
++  void computeScaledAddress(const BaseIndex& address, Register dest) {
++    if (address.scale == TimesOne) {
++      as_add(dest, address.base, address.index);
++    } else if (dest != address.base && dest != address.index) {
++      x_sldi(dest, address.index, address.scale);
++      as_add(dest, address.base, dest);
++    } else {
++      UseScratchRegisterScope temps(*this);
++      Register scratch = temps.Acquire();
++      x_sldi(scratch, address.index, address.scale);
++      as_add(dest, address.base, scratch);
++    }
++  }
++
++  void computeEffectiveAddress(const Address& address, Register dest) {
++    if (address.offset == 0) {
++      if (dest != address.base) {
++        xs_mr(dest, address.base);
++      }
++    } else if (is_intN(address.offset, 16)) {
++      as_addi(dest, address.base, address.offset);
++    } else if (HasPOWER10() && is_intN(address.offset, 34)) {
++      // Single-insn 34-bit-signed reg+imm add. Avoids the scratch GPR.
++      as_paddi(dest, address.base, address.offset, /*R=*/false);
++    } else {
++      UseScratchRegisterScope temps(*this);
++      Register scratch = temps.Acquire();
++      MOZ_ASSERT(scratch != dest);
++      movePtr(ImmWord(address.offset), scratch);
++      as_add(dest, address.base, scratch);
++    }
++  }
++  void computeEffectiveAddress(const BaseIndex& address, Register dest) {
++    computeScaledAddress(address, dest);
++    if (address.offset) {
++      if (is_intN(address.offset, 16)) {
++        as_addi(dest, dest, address.offset);
++      } else if (HasPOWER10() && is_intN(address.offset, 34)) {
++        as_paddi(dest, dest, address.offset, /*R=*/false);
++      } else {
++        UseScratchRegisterScope temps(*this);
++        Register scratch = temps.Acquire();
++        MOZ_ASSERT(scratch != dest);
++        movePtr(ImmWord(address.offset), scratch);
++        as_add(dest, dest, scratch);
++      }
++    }
++  }
++
++  // ===============================================================
++  // Move instructions
++
++  void mov(Register src, Register dest) { xs_mr(dest, src); }
++  void mov(ImmWord imm, Register dest) { movePtr(imm, dest); }
++  void mov(ImmPtr imm, Register dest) {
++    mov(ImmWord(uintptr_t(imm.value)), dest);
++  }
++  // Emit an 8-instruction NOP stanza for a patchable 64-bit load.
++  // Pool flushes are inhibited during emission to prevent pool data
++  // from being inserted mid-stanza.
++  BufferOffset emitLoad64Stanza(Register dest, uint64_t value) {
++    m_buffer.enterNoPool(kNoPoolLoad64StanzaInsns);
++    BufferOffset bo = writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    m_buffer.leaveNoPool();
++    // If any of the 8 writeInst calls hit OOM, only some of the stanza
++    // was reserved in the buffer. WriteLoad64Instructions writes 32 bytes
++    // unconditionally, so calling it here would overflow the Vector's
++    // backing store and corrupt the next heap chunk's metadata, surfacing
++    // later as a malloc-detected free-time crash.
++    if (m_buffer.oom()) {
++      return bo;
++    }
++    WriteLoad64Instructions((Instruction*)editSrc(bo), dest, value);
++    return bo;
++  }
++
++  void mov(CodeLabel* label, Register dest) {
++    BufferOffset bo = emitLoad64Stanza(dest, LabelBase::INVALID_OFFSET);
++    label->patchAt()->bind(bo.getOffset());
++    label->setLinkMode(CodeLabel::MoveImmediate);
++  }
++  void mov(Register src, Address dest) { storePtr(src, dest); }
++  void mov(Address src, Register dest) { loadPtr(src, dest); }
++
++  void move32(Imm32 imm, Register dest) {
++    if (is_intN(imm.value, 16)) {
++      xs_li(dest, (int16_t)imm.value);
++    } else if (is_uintN((uint32_t)imm.value, 16)) {
++      xs_li(dest, 0);
++      as_ori(dest, dest, (uint16_t)imm.value);
++    } else {
++      xs_lis(dest, (int16_t)((uint32_t)imm.value >> 16));
++      if (imm.value & 0xffff) {
++        as_ori(dest, dest, (uint16_t)imm.value);
++      }
++    }
++  }
++  void move32(Register src, Register dest) { as_extsw(dest, src); }
++
++  void movePtr(Register src, Register dest) {
++    if (src != dest) {
++      xs_mr(dest, src);
++    }
++  }
++  void movePtr(ImmWord imm, Register dest) {
++    if (imm.value == 0) {
++      xs_li(dest, 0);
++    } else if (is_intN((intptr_t)imm.value, 16)) {
++      xs_li(dest, (int16_t)imm.value);
++    } else if (is_uintN(imm.value, 16)) {
++      xs_li(dest, 0);
++      as_ori(dest, dest, (uint16_t)imm.value);
++    } else if (is_intN((intptr_t)imm.value, 32)) {
++      // 32-bit signed: lis + ori (2 instructions).
++      xs_lis(dest, (int16_t)((uint32_t)imm.value >> 16));
++      if (imm.value & 0xFFFF) {
++        as_ori(dest, dest, (uint16_t)imm.value);
++      }
++    } else if (HasPOWER10() && is_intN((intptr_t)imm.value, 34)) {
++      // POWER10 single-instruction 34-bit signed immediate. Replaces the
++      // 5-insn fallback for values in (33-34)-bit signed range.
++      // 8 bytes vs 20 bytes; one slot temp register is no longer needed.
++      as_paddi(dest, r0, (int64_t)imm.value, /*R=*/false);
++    } else {
++      // Full 64-bit: GCC-style lis+ori+lis+ori+rldimi (5 instructions).
++      // No LR clobber, no embedded data — pure instruction sequence.
++      uint32_t lo32 = (uint32_t)(imm.value);
++      uint32_t hi32 = (uint32_t)(imm.value >> 32);
++      Register temp = (dest != SecondScratchReg) ? SecondScratchReg
++                                                 : SavedScratchRegister;
++      m_buffer.ensureSpace(5 * sizeof(uint32_t));
++      xs_lis(dest, (int16_t)(lo32 >> 16));
++      as_ori(dest, dest, lo32 & 0xFFFF);
++      xs_lis(temp, (int16_t)(hi32 >> 16));
++      as_ori(temp, temp, hi32 & 0xFFFF);
++      as_rldimi(dest, temp, 32, 0);
++    }
++  }
++  void movePtr(ImmPtr imm, Register dest) {
++    movePtr(ImmWord(uintptr_t(imm.value)), dest);
++  }
++
++  // Load a 64-bit FPR constant from the inline constant pool.
++  // POWER9: 2 instructions (addpcis + lfd) -- no alignment constraint.
++  // POWER10: 1 prefixed instruction (plfd, 2 slots), or 3 slots in the
++  //   (loadAddr & 63) == 60 alignment-leading-nop case. Reserve 3 to
++  //   cover both cases conservatively.
++  // POWER8: not used -- loadConstantDouble inlines the constant.
++  BufferOffset loadFromPoolFloat64(FloatRegister dest, double value) {
++    size_t slots = HasPOWER10() ? 3 : 2;
++    uint32_t hint = (uint32_t(dest.encoding()) << 16) |
++                    (uint32_t(PoolLoadFPR64) << 21) | 0xF0000000;
++    uint32_t inst[3] = {hint, NopInst, NopInst};
++    return m_buffer.allocEntry(slots, 2, (uint8_t*)inst, (uint8_t*)&value);
++  }
++  // Load a 32-bit FPR constant from the inline constant pool.
++  // Same shape as loadFromPoolFloat64 (above). lfs/plfs auto-expand the
++  // 32-bit single-precision value to double in the FPR, so no follow-up
++  // xscvspdpn is needed.
++  BufferOffset loadFromPoolFloat32(FloatRegister dest, float value) {
++    size_t slots = HasPOWER10() ? 3 : 2;
++    uint32_t hint = (uint32_t(dest.encoding()) << 16) |
++                    (uint32_t(PoolLoadFPR32) << 21) | 0xF0000000;
++    uint32_t inst[3] = {hint, NopInst, NopInst};
++    return m_buffer.allocEntry(slots, 1, (uint8_t*)inst, (uint8_t*)&value);
++  }
++  // Load a 128-bit SIMD constant from the inline constant pool.
++  // Per-arch slot reservation -- the patcher writes only the slots
++  // each micro-arch actually needs:
++  //   P8: 5 (bcl + mflr + addi + lxvd2x + xxpermdi)
++  //   P9: 3 (addpcis + addi + lxvx) -- no LR touch, no RAS hazard
++  //   P10: 3 (alignment-safe: prefix + suffix + 1 reserve for the
++  //          (loadAddr & 63) == 60 leading-nop case)
++  // Pool entry is 4 × 4-byte words = 16 bytes. P9 uses
++  // SavedScratchRegister (r16) as the PC base; P10 emits a single
++  // PC-relative plxv with no scratch and no LR touch. Only P8 still
++  // clobbers LR (correctness-only fallback; live by design).
++  BufferOffset loadFromPoolSimd128(FloatRegister dest,
++                                   const SimdConstant& v) {
++    size_t slots;
++    if (HasPOWER10()) {
++      slots = 3;
++    } else if (HasPOWER9()) {
++      slots = 3;
++    } else {
++      slots = 5;
++    }
++    // Simd128 encoding is 32-63; mask to 5 bits for hint.
++    // PatchConstantPoolLoad sets TX bit unconditionally for Simd128.
++    uint32_t hint = ((uint32_t(dest.encoding()) & 0x1F) << 16) |
++                    (uint32_t(PoolLoadSimd128) << 21) | 0xF0000000;
++    uint32_t inst[5] = {hint, NopInst, NopInst, NopInst, NopInst};
++    return m_buffer.allocEntry(slots, 4, (uint8_t*)inst, (uint8_t*)v.bytes());
++  }
++  void movePtr(wasm::SymbolicAddress imm, Register dest) {
++    BufferOffset bo = emitLoad64Stanza(dest, (uint64_t)-1);
++    append(wasm::SymbolicAccess(CodeOffset(bo.getOffset()), imm));
++  }
++  void movePtr(ImmGCPtr imm, Register dest) {
++    BufferOffset bo = emitLoad64Stanza(dest,
++                                       (uint64_t)uintptr_t(imm.value));
++    Assembler::writeDataRelocation(bo, imm);
++  }
++
++  void moveFloat32(FloatRegister src, FloatRegister dest) {
++    if (src != dest) {
++      as_fmr(dest, src);
++    }
++  }
++  void moveDouble(FloatRegister src, FloatRegister dest) {
++    if (src != dest) {
++      as_fmr(dest, src);
++    }
++  }
++
++  // ===============================================================
++  // Branch functions
++
++  void branch(JitCode* c) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    BufferOffset bo = emitLoad64Stanza(scratch, (uint64_t)uintptr_t(c->raw()));
++    addPendingJump(bo, ImmPtr(c->raw()), RelocationKind::JITCODE);
++    xs_mtctr(scratch);
++    as_bctr();
++  }
++  void branch(const Register reg) {
++    xs_mtctr(reg);
++    as_bctr();
++  }
++
++  void jump(Label* label) {
++    if (label->bound()) {
++      // Open the no-pool window BEFORE computing the displacement. The
++      // enterNoPool() call itself can trigger a pool flush, which advances
++      // currentOffset(). Computing the displacement against the pre-flush
++      // offset and then emitting the b at the post-flush offset would land
++      // the branch (poolSize) bytes past the intended target.
++      m_buffer.enterNoPool(2);
++      int32_t offset = label->offset() - currentOffset();
++      if (JOffImm26::IsInRange(offset)) {
++        as_b(offset);
++        writeInst(NopInst);
++        m_buffer.leaveNoPool();
++        return;
++      }
++      m_buffer.leaveNoPool();
++      // Long jump to bound label.
++      BufferOffset bo =
++          emitLoad64Stanza(SecondScratchReg, LabelBase::INVALID_OFFSET);
++      xs_mtctr(SecondScratchReg);
++      as_bctr();
++      addLongJump(bo, BufferOffset(label->offset()));
++      return;
++    }
++    // Unbound label: emit trap-tagged stanza (10 slots).
++    m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++    BufferOffset bo = xs_trap_tagged(BTag);
++    writeInst(label->used() ? label->offset() : LabelBase::INVALID_OFFSET);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    m_buffer.leaveNoPool();
++    if (!oom()) {
++      label->use(bo.getOffset());
++    }
++  }
++  void jump(Register reg) {
++    xs_mtctr(reg);
++    as_bctr();
++  }
++  void jump(const Address& address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    loadPtr(address, scratch);
++    xs_mtctr(scratch);
++    as_bctr();
++  }
++  void jump(JitCode* code) { branch(code); }
++  void jump(ImmPtr ptr) {
++    BufferOffset bo =
++        emitLoad64Stanza(SecondScratchReg, (uint64_t)uintptr_t(ptr.value));
++    addPendingJump(bo, ptr, RelocationKind::HARDCODED);
++    xs_mtctr(SecondScratchReg);
++    as_bctr();
++  }
++  void jump(TrampolinePtr code) { jump(ImmPtr(code.value)); }
++
++  // Conditional branch to label. Assumes a compare instruction has already
++  // been emitted that sets CR0.
++  template <typename CondT>
++  void ma_b(CondT cond, Label* label) {
++    if constexpr (std::is_same_v<CondT, Condition>) {
++      if (cond == Always) {
++        jump(label);
++        return;
++      }
++    }
++    if (label->bound()) {
++      // Open the no-pool window BEFORE computing the displacement. Same
++      // hazard as jump(): enterNoPool may itself flush a pending pool,
++      // advancing currentOffset(); the bc must emit with a displacement
++      // computed against the post-flush offset. Budget covers max 6
++      // instructions: POWER8 Overflow XER ops (3) + cror (1) + bc (1) +
++      // nop (1) for the worst-case DoubleCondition+Overflow short path.
++      m_buffer.enterNoPool(6);
++      // For DoubleCondition, as_bc emits cror/crandc before the bc
++      // instruction, advancing currentOffset() by 4. Account for this
++      // in the offset calculation.
++      int32_t crAdjust = 0;
++      if constexpr (std::is_same_v<CondT, DoubleCondition>) {
++        crAdjust = -(int32_t)sizeof(uint32_t);
++      }
++      int32_t offset = label->offset() - currentOffset() + crAdjust;
++      if (BOffImm16::IsInRange(offset)) {
++        as_bc((int16_t)offset, cond);
++        writeInst(NopInst);
++        m_buffer.leaveNoPool();
++        return;
++      }
++      m_buffer.leaveNoPool();
++      // Long conditional branch for bound label.
++      // XER ops(0-3) + cror(0-1) + bc(1) + stanza(8) + mtctr(1) + bctr(1).
++      // P8 Overflow: mfxer+rlwinm+mtcrf+bc+stanza+mtctr+bctr = 14 max.
++      m_buffer.enterNoPool(kNoPoolCondLongBranchInsnsP8Max);
++      as_bc((int16_t)44, InvertCondition(cond));
++      BufferOffset boLoad =
++          emitLoad64Stanza(SecondScratchReg, LabelBase::INVALID_OFFSET);
++      xs_mtctr(SecondScratchReg);
++      as_bctr();
++      m_buffer.leaveNoPool();
++      addLongJump(boLoad, BufferOffset(label->offset()));
++      return;
++    }
++    // Forward reference: emit BCTag stanza.
++    // XER ops(0-3) + cror(0-1) + bc(1) + trap_tagged(1) + chain(1) + 8 NOPs.
++    // P8 Overflow: mfxer+rlwinm+mtcrf+bc+trap+chain+8NOPs = 14 max.
++    m_buffer.enterNoPool(kNoPoolCondLongBranchInsnsP8Max);
++    as_bc((int16_t)44, InvertCondition(cond));
++    BufferOffset bo = xs_trap_tagged(BCTag);
++    writeInst(label->used() ? label->offset() : LabelBase::INVALID_OFFSET);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    writeInst(NopInst);
++    m_buffer.leaveNoPool();
++    if (!oom()) {
++      label->use(bo.getOffset());
++    }
++  }
++
++  // Set dest = 1 if CR0 satisfies cond, else dest = 0.
++  // POWER10: setbc/setbcr (1 insn). P8/P9: isel-based path with the
++  // r0-as-zero trick on the BranchOnClear half.
++  void ma_cmp_set(Register dest, Condition cond) {
++    uint32_t base = uint32_t(cond) & 0xff;
++    uint32_t setbase = (base & ~BranchOptionMask) | BranchOnSet;
++    if (HasPOWER10()) {
++      if ((base & BranchOptionMask) == BranchOnSet) {
++        as_setbc(dest, setbase, cr0);
++      } else {
++        as_setbcr(dest, setbase, cr0);
++      }
++      return;
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    xs_li(scratch, 1);
++    if ((base & BranchOptionMask) == BranchOnSet) {
++      xs_li(dest, 0);
++      as_isel(dest, scratch, dest, setbase, cr0);
++    } else {
++      as_isel0(dest, r0, scratch, setbase, cr0);
++    }
++  }
++
++  void ma_cmp_set_dbl(Register dest, DoubleCondition cond) {
++    uint32_t base = uint32_t(cond) & 0xff;
++    bool hasUnorderedFlag = uint32_t(cond) & DoubleConditionUnordered;
++    uint32_t setbase = (base & ~BranchOptionMask) | BranchOnSet;
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    if (HasPOWER10()) {
++      if ((base & BranchOptionMask) == BranchOnSet) {
++        as_setbc(dest, setbase, cr0);
++      } else {
++        as_setbcr(dest, setbase, cr0);
++      }
++      // Fixup paths below still need scratch=1 for the SO-isel.
++      if (hasUnorderedFlag || ((base & BranchOptionMask) != BranchOnSet &&
++                               cond != DoubleOrdered)) {
++        xs_li(scratch, 1);
++      }
++    } else {
++      xs_li(scratch, 1);
++      if ((base & BranchOptionMask) == BranchOnSet) {
++        xs_li(dest, 0);
++        as_isel(dest, scratch, dest, setbase, cr0);
++      } else {
++        as_isel0(dest, r0, scratch, setbase, cr0);
++      }
++    }
++    if (hasUnorderedFlag) {
++      // Condition includes unordered (NaN): force dest=1 when SO is set.
++      // isel dest, scratch(=1), dest, SO
++      as_isel(dest, scratch, dest, uint16_t(SOBit), cr0);
++    } else if ((base & BranchOptionMask) != BranchOnSet &&
++               cond != DoubleOrdered) {
++      // Ordered comparison that negates a CR bit (BranchOnClear): NaN
++      // produces all-zero LT/GT/EQ bits which makes the negation return
++      // true.  Fix by forcing dest=0 when SO is set.
++      as_isel0(dest, r0, dest, uint16_t(SOBit), cr0);
++    }
++  }
++
++  // Conditional move: if CR0 satisfies cond, dest = src.
++  void ma_cmp_move(Register dest, Register src, Condition cond) {
++    uint32_t base = uint32_t(cond) & 0xff;
++    uint32_t setbase = (base & ~BranchOptionMask) | BranchOnSet;
++    if ((base & BranchOptionMask) == BranchOnSet) {
++      as_isel(dest, src, dest, setbase, cr0);
++    } else {
++      as_isel(dest, dest, src, setbase, cr0);
++    }
++  }
++
++  // If cond == 0, move src to dst; otherwise dst is unchanged. The only
++  // callers are wasm select, whose condition is a 32-bit value: test its
++  // 32-bit sign with cmpwi so high-bit garbage (e.g. under register pressure)
++  // does not make a zero condition read as non-zero.
++  void moveIfZero(Register dst, Register src, Register cond) {
++    as_cmpwi(cond, 0);
++    as_isel(dst, src, dst, Equal, cr0);
++  }
++
++  void ma_add32TestCarry(Condition cond, Register rd, Register rs, Imm32 imm,
++                         Label* overflow);
++  void ma_addPtrTestCarry(Condition cond, Register rd, Register rs, ImmWord imm,
++                          Label* overflow);
++
++  // Issue the correct compare instruction for the given condition and
++  // operand sizes. Returns the condition to use with ma_b or ma_cmp_set
++  // (usually the same, but unsigned conditions use cmpl* variants).
++  Condition ma_cmp(Register lhs, Register rhs, Condition cond,
++                   bool is32bit = false) {
++    Condition base =
++        static_cast<Condition>(cond & ~(ConditionUnsigned | ConditionZero));
++    bool isUnsigned = (cond & ConditionUnsigned) != 0;
++    // ConditionZero-flagged conditions (Signed, NotSigned, Zero, NonZero)
++    // test a single register against zero, not two registers against each
++    // other. Compare against immediate 0.
++    if ((cond & ConditionZero) != 0) {
++      if (is32bit) {
++        as_cmpwi(lhs, 0);
++      } else {
++        as_cmpdi(lhs, 0);
++      }
++      return base;
++    }
++    if (is32bit) {
++      if (isUnsigned) {
++        as_cmplw(lhs, rhs);
++      } else {
++        as_cmpw(lhs, rhs);
++      }
++    } else {
++      if (isUnsigned) {
++        as_cmpld(lhs, rhs);
++      } else {
++        as_cmpd(lhs, rhs);
++      }
++    }
++    return base;
++  }
++
++  Condition ma_cmp(Register lhs, Imm32 rhs, Condition cond,
++                   bool is32bit = false) {
++    Condition base =
++        static_cast<Condition>(cond & ~(ConditionUnsigned | ConditionZero));
++    bool isUnsigned = (cond & ConditionUnsigned) != 0;
++    if (isUnsigned) {
++      if (is_uintN(rhs.value, 16)) {
++        if (is32bit) {
++          as_cmplwi(lhs, rhs.value);
++        } else {
++          as_cmpldi(lhs, rhs.value);
++        }
++        return base;
++      }
++    } else {
++      if (is_intN(rhs.value, 16)) {
++        if (is32bit) {
++          as_cmpwi(lhs, rhs.value);
++        } else {
++          as_cmpdi(lhs, rhs.value);
++        }
++        return base;
++      }
++    }
++    // Immediate doesn't fit — materialize into scratch and compare.
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    move32(rhs, scratch);
++    return ma_cmp(lhs, scratch, cond, is32bit);
++  }
++
++  Condition ma_cmp(Register lhs, ImmWord rhs, Condition cond) {
++    Condition base =
++        static_cast<Condition>(cond & ~(ConditionUnsigned | ConditionZero));
++    bool isUnsigned = (cond & ConditionUnsigned) != 0;
++    if (isUnsigned) {
++      if (is_uintN(rhs.value, 16)) {
++        as_cmpldi(lhs, rhs.value);
++        return base;
++      }
++    } else {
++      if (is_intN(rhs.value, 16)) {
++        as_cmpdi(lhs, rhs.value);
++        return base;
++      }
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(rhs, scratch);
++    return ma_cmp(lhs, scratch, cond);
++  }
++
++  Condition ma_cmp(Register lhs, ImmPtr rhs, Condition cond) {
++    return ma_cmp(lhs, ImmWord(uintptr_t(rhs.value)), cond);
++  }
++
++  Condition ma_cmp(Register lhs, ImmGCPtr rhs, Condition cond) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(rhs, scratch);
++    return ma_cmp(lhs, scratch, cond);
++  }
++
++  Condition ma_cmp(Register lhs, ImmTag rhs, Condition cond) {
++    // Tag values on PUNBOX64 are 17-bit (0x1FFF0+), too large for 16-bit
++    // signed or unsigned immediates.
++    Condition base =
++        static_cast<Condition>(cond & ~(ConditionUnsigned | ConditionZero));
++    bool isUnsigned = (cond & ConditionUnsigned) != 0;
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    move32(Imm32(rhs.value), scratch);
++    if (isUnsigned) {
++      as_cmpld(lhs, scratch);
++    } else {
++      as_cmpd(lhs, scratch);
++    }
++    return base;
++  }
++
++  // Compare a tag register against an ImmTag constant and branch, WITHOUT
++  // acquiring a scratch register.  Uses xoris+cmplwi which MODIFIES tagReg.
++  // Only safe when tagReg is a scratch register owned by the caller.
++  void branchTestTag(Condition cond, Register tagReg, ImmTag tag, Label* label) {
++    MOZ_ASSERT(cond == Equal || cond == NotEqual);
++    uint32_t t = tag.value;
++    as_xoris(tagReg, tagReg, t >> 16);
++    as_cmplwi(tagReg, t & 0xFFFF);
++    Condition c = (cond == Equal) ? Equal : NotEqual;
++    ma_b(c, label);
++  }
++
++  void ma_mod_mask(Register src, Register dest, Register hold, Register remain,
++                   int32_t shift, Label* negZero = nullptr);
++
++  void nop() { writeInst(NopInst); }
++  void breakpoint(uint32_t value = 0) { xs_trap(); }
++
++  inline void retn(Imm32 n);
++
++  // ===============================================================
++  // Stack operations
++
++  void push(Imm32 imm) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    move32(imm, scratch);
++    push(scratch);
++  }
++  void push(ImmWord imm) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(imm, scratch);
++    push(scratch);
++  }
++  void push(ImmGCPtr imm) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(imm, scratch);
++    push(scratch);
++  }
++  void push(const Address& address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    loadPtr(address, scratch);
++    push(scratch);
++  }
++  void push(Register reg) { as_stdu(reg, StackPointer, -8); }
++  void push(FloatRegister reg) {
++    // stfdu/stfsu fuses the SP decrement and the FP store: EA=SP-8,
++    // MEM[EA]=reg, SP=EA. 1 insn instead of addi+stfd/stfs.
++    if (reg.isSingle()) {
++      as_stfsu(reg, StackPointer, -8);
++    } else {
++      as_stfdu(reg, StackPointer, -8);
++    }
++  }
++  void pop(Register reg) {
++    as_ld(reg, StackPointer, 0);
++    as_addi(StackPointer, StackPointer, 8);
++  }
++  void pop(FloatRegister reg) {
++    if (reg.isSingle()) {
++      as_lfs(reg, StackPointer, 0);
++    } else {
++      as_lfd(reg, StackPointer, 0);
++    }
++    as_addi(StackPointer, StackPointer, 8);
++  }
++
++  CodeOffset pushWithPatch(ImmWord imm) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    CodeOffset offset = movWithPatch(imm, scratch);
++    push(scratch);
++    return offset;
++  }
++  CodeOffset movWithPatch(ImmWord imm, Register dest) {
++    BufferOffset bo = emitLoad64Stanza(dest, (uint64_t)imm.value);
++    return CodeOffset(bo.getOffset());
++  }
++  CodeOffset movWithPatch(ImmPtr imm, Register dest) {
++    return movWithPatch(ImmWord(uintptr_t(imm.value)), dest);
++  }
++
++  // ===============================================================
++  // Tag/unbox operations
++
++  void splitTag(Register src, Register dest) {
++    x_srdi(dest, src, JSVAL_TAG_SHIFT);
++  }
++  void splitTag(const ValueOperand& operand, Register dest) {
++    splitTag(operand.valueReg(), dest);
++  }
++  void splitTagForTest(const ValueOperand& value, ScratchTagScope& tag) {
++    splitTag(value, tag);
++  }
++
++  void unboxNonDouble(const ValueOperand& operand, Register dest,
++                      JSValueType type) {
++    unboxNonDouble(operand.valueReg(), dest, type);
++  }
++  template <typename T>
++  void unboxNonDouble(T src, Register dest, JSValueType type) {
++    MOZ_ASSERT(type != JSVAL_TYPE_DOUBLE);
++    if (type == JSVAL_TYPE_INT32 || type == JSVAL_TYPE_BOOLEAN) {
++      load32(src, dest);
++      return;
++    }
++    loadPtr(src, dest);
++    unboxNonDouble(dest, dest, type);
++  }
++  void unboxNonDouble(Register src, Register dest, JSValueType type) {
++    MOZ_ASSERT(type != JSVAL_TYPE_DOUBLE);
++    if (type == JSVAL_TYPE_INT32 || type == JSVAL_TYPE_BOOLEAN) {
++      as_extsw(dest, src);
++      return;
++    }
++    // Extract the payload (lower 47 bits) by clearing the tag.
++    // This avoids acquiring a scratch register, preventing pool exhaustion
++    // when called from nested scratch scopes (e.g., ScratchTagScope →
++    // branchTestStringTruthy → unboxString → here).
++    // rldicl dest, src, 0, 17 — clear upper 17 bits (tag), keep lower 47.
++    as_rldicl(dest, src, 0, 17);
++  }
++  void unboxGCThingForGCBarrier(const Address& src, Register dest) {
++    loadPtr(src, dest);
++    // Clear tag bits (top 17 bits on 64-bit).
++    as_rldicl(dest, dest, 0, 64 - JSVAL_TAG_SHIFT);
++  }
++  void unboxGCThingForGCBarrier(const ValueOperand& src, Register dest) {
++    as_rldicl(dest, src.valueReg(), 0, 64 - JSVAL_TAG_SHIFT);
++  }
++  void unboxWasmAnyRefGCThingForGCBarrier(const Address& src, Register dest) {
++    static_assert(wasm::AnyRef::TagShift == 2);
++    loadPtr(src, dest);
++    as_rldicr(dest, dest, 0, 61);
++  }
++  void getGCThingValueChunk(const Address& src, Register dest) {
++    loadPtr(src, dest);
++    as_rldicl(dest, dest, 0, 17);
++    as_rldicr(dest, dest, 0, 43);
++  }
++  void getGCThingValueChunk(const ValueOperand& src, Register dest) {
++    as_rldicl(dest, src.valueReg(), 0, 17);
++    as_rldicr(dest, dest, 0, 43);
++  }
++
++  void boxDouble(FloatRegister src, const ValueOperand& dest, FloatRegister) {
++    as_mfvsrd(dest.valueReg(), src);
++  }
++  void boxNonDouble(JSValueType type, Register src, const ValueOperand& dest) {
++    boxValue(type, src, dest.valueReg());
++  }
++  void boxNonDouble(Register type, Register src, const ValueOperand& dest) {
++    boxValue(type, src, dest.valueReg());
++  }
++  void unboxInt32(const ValueOperand& operand, Register dest) {
++    as_extsw(dest, operand.valueReg());
++  }
++  void unboxInt32(const Address& src, Register dest) { load32(src, dest); }
++  void unboxInt32(const BaseIndex& src, Register dest) { load32(src, dest); }
++  void unboxBoolean(const ValueOperand& operand, Register dest) {
++    as_extsw(dest, operand.valueReg());
++  }
++  void unboxBoolean(const Address& src, Register dest) { load32(src, dest); }
++  void unboxBoolean(const BaseIndex& src, Register dest) { load32(src, dest); }
++  void unboxDouble(const ValueOperand& operand, FloatRegister dest) {
++    as_mtvsrd(dest, operand.valueReg());
++  }
++  void unboxDouble(const Address& src, FloatRegister dest) {
++    loadDouble(src, dest);
++  }
++  void unboxDouble(const BaseIndex& src, FloatRegister dest) {
++    loadDouble(src, dest);
++  }
++  void unboxString(const ValueOperand& operand, Register dest) {
++    unboxNonDouble(operand, dest, JSVAL_TYPE_STRING);
++  }
++  void unboxString(const Address& src, Register dest) {
++    unboxNonDouble(src, dest, JSVAL_TYPE_STRING);
++  }
++  void unboxSymbol(const ValueOperand& operand, Register dest) {
++    unboxNonDouble(operand, dest, JSVAL_TYPE_SYMBOL);
++  }
++  void unboxSymbol(const Address& src, Register dest) {
++    unboxNonDouble(src, dest, JSVAL_TYPE_SYMBOL);
++  }
++  void unboxBigInt(const ValueOperand& operand, Register dest) {
++    unboxNonDouble(operand, dest, JSVAL_TYPE_BIGINT);
++  }
++  void unboxBigInt(const Address& src, Register dest) {
++    unboxNonDouble(src, dest, JSVAL_TYPE_BIGINT);
++  }
++  void unboxObject(const ValueOperand& src, Register dest) {
++    unboxNonDouble(src, dest, JSVAL_TYPE_OBJECT);
++  }
++  void unboxObject(const Address& src, Register dest) {
++    unboxNonDouble(src, dest, JSVAL_TYPE_OBJECT);
++  }
++  void unboxObject(const BaseIndex& src, Register dest) {
++    unboxNonDouble(src, dest, JSVAL_TYPE_OBJECT);
++  }
++  void unboxValue(const ValueOperand& src, AnyRegister dest, JSValueType type) {
++    if (dest.isFloat()) {
++      unboxDouble(src, dest.fpu());
++    } else {
++      unboxNonDouble(src, dest.gpr(), type);
++    }
++  }
++  void unboxObjectOrNull(const Address& src, Register dest) {
++    loadPtr(src, dest);
++    // Object pointers have the object tag in high bits; null has a different
++    // tag. Clear the top bits to get either a valid pointer or zero.
++    as_rldicl(dest, dest, 0, 64 - JSVAL_TAG_SHIFT);
++  }
++
++  void tagValue(JSValueType type, Register payload, ValueOperand dest) {
++    MOZ_ASSERT(type != JSVAL_TYPE_UNDEFINED && type != JSVAL_TYPE_NULL);
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    MOZ_ASSERT(scratch != payload && scratch != dest.valueReg());
++    tagValueWithScratch(type, payload, dest, scratch);
++  }
++  void tagValueWithScratch(JSValueType type, Register payload,
++                           ValueOperand dest, Register scratch) {
++    movePtr(ImmShiftedTag(type), scratch);
++    if (type == JSVAL_TYPE_INT32 || type == JSVAL_TYPE_BOOLEAN ||
++        type == JSVAL_TYPE_MAGIC) {
++      if (payload != dest.valueReg()) {
++        movePtr(payload, dest.valueReg());
++      }
++      as_rldicl(dest.valueReg(), dest.valueReg(), 0, 32);
++      as_or_(dest.valueReg(), dest.valueReg(), scratch);
++    } else {
++      if (payload != dest.valueReg()) {
++        movePtr(payload, dest.valueReg());
++      }
++      as_or_(dest.valueReg(), dest.valueReg(), scratch);
++    }
++  }
++  void boxValue(JSValueType type, Register src, Register dest) {
++    MOZ_ASSERT(src != dest);
++    MOZ_ASSERT(type != JSVAL_TYPE_UNDEFINED && type != JSVAL_TYPE_NULL);
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    boxValueWithScratch(type, src, dest, scratch);
++  }
++  void boxValueWithScratch(JSValueType type, Register src, Register dest,
++                           Register scratch) {
++    if (type == JSVAL_TYPE_INT32 || type == JSVAL_TYPE_BOOLEAN ||
++        type == JSVAL_TYPE_MAGIC) {
++      as_rldicl(dest, src, 0, 32);
++      movePtr(ImmShiftedTag(type), scratch);
++      as_or_(dest, dest, scratch);
++    } else {
++      movePtr(ImmShiftedTag(type), scratch);
++      xs_mr(dest, src);
++      as_or_(dest, dest, scratch);
++    }
++  }
++  void boxValue(Register type, Register src, Register dest) {
++    MOZ_ASSERT(src != dest);
++
++#ifdef DEBUG
++    Label done, isNullOrUndefined, isBoolean, isInt32OrMagic;
++
++    // Use ma_cmp + ma_b instead of asMasm().branch32() because
++    // MacroAssembler is not yet fully defined at this point.
++    Condition cond;
++    cond = ma_cmp(type, Imm32(JSVAL_TYPE_NULL), Equal, true);
++    ma_b(cond, &isNullOrUndefined);
++    cond = ma_cmp(type, Imm32(JSVAL_TYPE_UNDEFINED), Equal, true);
++    ma_b(cond, &isNullOrUndefined);
++    cond = ma_cmp(type, Imm32(JSVAL_TYPE_BOOLEAN), Equal, true);
++    ma_b(cond, &isBoolean);
++    cond = ma_cmp(type, Imm32(JSVAL_TYPE_INT32), Equal, true);
++    ma_b(cond, &isInt32OrMagic);
++    cond = ma_cmp(type, Imm32(JSVAL_TYPE_MAGIC), Equal, true);
++    ma_b(cond, &isInt32OrMagic);
++    // GCThing types aren't supported, because as_rldicl truncates
++    // payloads above UINT32_MAX.
++    breakpoint();
++    {
++      bind(&isNullOrUndefined);
++
++      // Ensure no payload for null and undefined.
++      cond = ma_cmp(src, ImmWord(0), Equal);
++      ma_b(cond, &done);
++      breakpoint();
++    }
++    {
++      bind(&isBoolean);
++
++      // Ensure boolean values are either 0 or 1.
++      cond = ma_cmp(src, Imm32(1), BelowOrEqual, true);
++      ma_b(cond, &done);
++      breakpoint();
++    }
++    {
++      bind(&isInt32OrMagic);
++
++      // Ensure |src| is sign-extended.
++      UseScratchRegisterScope debugTemps(*this);
++      Register debugScratch = debugTemps.Acquire();
++      as_extsw(debugScratch, src);
++      cond = ma_cmp(src, debugScratch, Equal);
++      ma_b(cond, &done);
++      breakpoint();
++    }
++    bind(&done);
++#endif
++
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    MOZ_ASSERT(scratch != dest && scratch != src && scratch != type);
++    // Build tag: (type | JSVAL_TAG_MAX_DOUBLE) << JSVAL_TAG_SHIFT
++    move32(Imm32(JSVAL_TAG_MAX_DOUBLE), scratch);
++    as_or_(scratch, scratch, type);
++    x_sldi(scratch, scratch, JSVAL_TAG_SHIFT);
++    // Insert 32-bit payload.
++    as_rldicl(dest, src, 0, 32);
++    as_or_(dest, dest, scratch);
++  }
++
++  // ===============================================================
++  // Value store/load/push/pop
++
++  void storeValue(ValueOperand val, const Address& dest) {
++    storePtr(val.valueReg(), dest);
++  }
++  void storeValue(ValueOperand val, const BaseIndex& dest) {
++    storePtr(val.valueReg(), dest);
++  }
++  void storeValue(JSValueType type, Register reg, Address dest) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    MOZ_ASSERT(dest.base != scratch);
++    boxValue(type, reg, scratch);
++    storePtr(scratch, dest);
++  }
++  void storeValue(JSValueType type, Register reg, BaseIndex dest) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    MOZ_ASSERT(dest.base != scratch);
++    boxValue(type, reg, scratch);
++    storePtr(scratch, dest);
++  }
++  void storeValue(const Value& val, Address dest) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    MOZ_ASSERT(dest.base != scratch);
++    if (val.isGCThing()) {
++      CodeOffset off = movWithPatch(ImmWord(val.asRawBits()), scratch);
++      writeDataRelocation(off, val);
++    } else {
++      movePtr(ImmWord(val.asRawBits()), scratch);
++    }
++    storePtr(scratch, dest);
++  }
++  void storeValue(const Value& val, BaseIndex dest) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    MOZ_ASSERT(dest.base != scratch);
++    if (val.isGCThing()) {
++      CodeOffset off = movWithPatch(ImmWord(val.asRawBits()), scratch);
++      writeDataRelocation(off, val);
++    } else {
++      movePtr(ImmWord(val.asRawBits()), scratch);
++    }
++    storePtr(scratch, dest);
++  }
++  void storeValue(const Address& src, const Address& dest, Register temp) {
++    loadPtr(src, temp);
++    storePtr(temp, dest);
++  }
++
++  void storePrivateValue(Register src, const Address& dest) {
++    storePtr(src, dest);
++  }
++  void storePrivateValue(ImmGCPtr imm, const Address& dest) {
++    storePtr(imm, dest);
++  }
++
++  void loadValue(Address src, ValueOperand val) {
++    loadPtr(src, val.valueReg());
++  }
++  void loadValue(const BaseIndex& src, ValueOperand val) {
++    loadPtr(src, val.valueReg());
++  }
++  void loadUnalignedValue(const Address& src, ValueOperand dest) {
++    loadPtr(src, dest.valueReg());
++  }
++
++  void pushValue(ValueOperand val) { push(val.valueReg()); }
++  void popValue(ValueOperand val) { pop(val.valueReg()); }
++  void pushValue(const Value& val) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    if (val.isGCThing()) {
++      CodeOffset off = movWithPatch(ImmWord(val.asRawBits()), scratch);
++      writeDataRelocation(off, val);
++    } else {
++      movePtr(ImmWord(val.asRawBits()), scratch);
++    }
++    push(scratch);
++  }
++  void pushValue(JSValueType type, Register reg) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    boxValue(type, reg, scratch);
++    push(scratch);
++  }
++  void pushValue(const Address& addr) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    loadPtr(addr, scratch);
++    push(scratch);
++  }
++  void pushValue(const BaseIndex& addr, Register scratch) {
++    loadPtr(addr, scratch);
++    push(scratch);
++  }
++
++  // ===============================================================
++  // Load instructions
++
++  FaultingCodeOffset load8SignExtend(const Address& address, Register dest) {
++    FaultingCodeOffset fco;
++    if (is_intN(address.offset, 16)) {
++      fco = FaultingCodeOffset(
++          as_lbz(dest, address.base, address.offset).getOffset());
++    } else {
++      UseScratchRegisterScope temps(*this);
++      Register scratch = temps.Acquire();
++      MOZ_ASSERT(scratch != dest);
++      movePtr(ImmWord(address.offset), scratch);
++      fco =
++          FaultingCodeOffset(as_lbzx(dest, address.base, scratch).getOffset());
++    }
++    as_extsb(dest, dest);
++    return fco;
++  }
++  FaultingCodeOffset load8SignExtend(const BaseIndex& src, Register dest) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    computeScaledAddress(src, scratch);
++    FaultingCodeOffset fco;
++    if (is_intN(src.offset, 16)) {
++      fco = FaultingCodeOffset(as_lbz(dest, scratch, src.offset).getOffset());
++    } else {
++      MOZ_ASSERT(scratch != dest);
++      movePtr(ImmWord(src.offset), dest);
++      fco = FaultingCodeOffset(as_lbzx(dest, scratch, dest).getOffset());
++    }
++    as_extsb(dest, dest);
++    return fco;
++  }
++  FaultingCodeOffset load8ZeroExtend(const Address& address, Register dest) {
++    if (is_intN(address.offset, 16)) {
++      return FaultingCodeOffset(
++          as_lbz(dest, address.base, address.offset).getOffset());
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    MOZ_ASSERT(scratch != dest);
++    movePtr(ImmWord(address.offset), scratch);
++    return FaultingCodeOffset(as_lbzx(dest, address.base, scratch).getOffset());
++  }
++  FaultingCodeOffset load8ZeroExtend(const BaseIndex& src, Register dest) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    computeScaledAddress(src, scratch);
++    if (is_intN(src.offset, 16)) {
++      return FaultingCodeOffset(as_lbz(dest, scratch, src.offset).getOffset());
++    }
++    MOZ_ASSERT(scratch != dest);
++    movePtr(ImmWord(src.offset), dest);
++    return FaultingCodeOffset(as_lbzx(dest, scratch, dest).getOffset());
++  }
++  FaultingCodeOffset load16SignExtend(const Address& address, Register dest) {
++    if (is_intN(address.offset, 16)) {
++      return FaultingCodeOffset(
++          as_lha(dest, address.base, address.offset).getOffset());
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    MOZ_ASSERT(scratch != dest);
++    movePtr(ImmWord(address.offset), scratch);
++    return FaultingCodeOffset(as_lhax(dest, address.base, scratch).getOffset());
++  }
++  FaultingCodeOffset load16SignExtend(const BaseIndex& src, Register dest) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    computeScaledAddress(src, scratch);
++    if (is_intN(src.offset, 16)) {
++      return FaultingCodeOffset(as_lha(dest, scratch, src.offset).getOffset());
++    }
++    MOZ_ASSERT(scratch != dest);
++    movePtr(ImmWord(src.offset), dest);
++    return FaultingCodeOffset(as_lhax(dest, scratch, dest).getOffset());
++  }
++  template <typename S>
++  void load16UnalignedSignExtend(const S& src, Register dest) {
++    load16SignExtend(src, dest);
++  }
++  FaultingCodeOffset load16ZeroExtend(const Address& address, Register dest) {
++    if (is_intN(address.offset, 16)) {
++      return FaultingCodeOffset(
++          as_lhz(dest, address.base, address.offset).getOffset());
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    MOZ_ASSERT(scratch != dest);
++    movePtr(ImmWord(address.offset), scratch);
++    return FaultingCodeOffset(as_lhzx(dest, address.base, scratch).getOffset());
++  }
++  FaultingCodeOffset load16ZeroExtend(const BaseIndex& src, Register dest) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    computeScaledAddress(src, scratch);
++    if (is_intN(src.offset, 16)) {
++      return FaultingCodeOffset(as_lhz(dest, scratch, src.offset).getOffset());
++    }
++    MOZ_ASSERT(scratch != dest);
++    movePtr(ImmWord(src.offset), dest);
++    return FaultingCodeOffset(as_lhzx(dest, scratch, dest).getOffset());
++  }
++  template <typename S>
++  void load16UnalignedZeroExtend(const S& src, Register dest) {
++    load16ZeroExtend(src, dest);
++  }
++
++  FaultingCodeOffset load32(const Address& address, Register dest) {
++    // lwa is DS-form (14-bit displacement × 4 = 16-bit-signed effective
++    // range, 4-byte alignment required). lwax is X-form indexed, no
++    // alignment constraint. Both sign-extend in one instruction; only
++    // the misaligned 16-bit-fitting case still needs lwz + extsw.
++    if (is_intN(address.offset, 16) && (address.offset & 3) == 0) {
++      return FaultingCodeOffset(
++          as_lwa(dest, address.base, address.offset).getOffset());
++    }
++    if (is_intN(address.offset, 16)) {
++      FaultingCodeOffset fco(
++          as_lwz(dest, address.base, address.offset).getOffset());
++      as_extsw(dest, dest);
++      return fco;
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    MOZ_ASSERT(scratch != dest);
++    movePtr(ImmWord(address.offset), scratch);
++    return FaultingCodeOffset(as_lwax(dest, address.base, scratch).getOffset());
++  }
++  FaultingCodeOffset load32(const BaseIndex& address, Register dest) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    computeScaledAddress(address, scratch);
++    if (is_intN(address.offset, 16) && (address.offset & 3) == 0) {
++      return FaultingCodeOffset(
++          as_lwa(dest, scratch, address.offset).getOffset());
++    }
++    if (is_intN(address.offset, 16)) {
++      FaultingCodeOffset fco(as_lwz(dest, scratch, address.offset).getOffset());
++      as_extsw(dest, dest);
++      return fco;
++    }
++    MOZ_ASSERT(scratch != dest);
++    movePtr(ImmWord(address.offset), dest);
++    return FaultingCodeOffset(as_lwax(dest, scratch, dest).getOffset());
++  }
++  void load32(AbsoluteAddress address, Register dest) {
++    movePtr(ImmWord((uintptr_t)address.addr), dest);
++    as_lwa(dest, dest, 0);
++  }
++  void load32(wasm::SymbolicAddress address, Register dest) {
++    movePtr(address, dest);
++    as_lwa(dest, dest, 0);
++  }
++  template <typename S>
++  void load32Unaligned(const S& src, Register dest) {
++    load32(src, dest);
++  }
++
++  FaultingCodeOffset load64(const Address& address, Register64 dest) {
++    return loadPtr(address, dest.reg);
++  }
++  FaultingCodeOffset load64(const BaseIndex& address, Register64 dest) {
++    return loadPtr(address, dest.reg);
++  }
++  template <typename S>
++  void load64Unaligned(const S& src, Register64 dest) {
++    load64(src, dest);
++  }
++
++  FaultingCodeOffset loadPtr(const Address& address, Register dest) {
++    // as_ld (DS-form) requires 4-byte aligned offset.
++    if (is_intN(address.offset, 16) && !(address.offset & 0x3)) {
++      return FaultingCodeOffset(
++          as_ld(dest, address.base, address.offset).getOffset());
++    }
++    if (HasPOWER10() && is_intN((intptr_t)address.offset, 34)) {
++      return FaultingCodeOffset(
++          as_pld(dest, address.base, (int64_t)address.offset, /*R=*/false)
++              .getOffset());
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    MOZ_ASSERT(scratch != dest);
++    movePtr(ImmWord(address.offset), scratch);
++    return FaultingCodeOffset(as_ldx(dest, address.base, scratch).getOffset());
++  }
++  FaultingCodeOffset loadPtr(const BaseIndex& src, Register dest) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    computeScaledAddress(src, scratch);
++    if (is_intN(src.offset, 16) && !(src.offset & 0x3)) {
++      return FaultingCodeOffset(as_ld(dest, scratch, src.offset).getOffset());
++    }
++    MOZ_ASSERT(scratch != dest);
++    movePtr(ImmWord(src.offset), dest);
++    return FaultingCodeOffset(as_ldx(dest, scratch, dest).getOffset());
++  }
++  void loadPtr(AbsoluteAddress address, Register dest) {
++    movePtr(ImmWord((uintptr_t)address.addr), dest);
++    as_ld(dest, dest, 0);
++  }
++  void loadPtr(wasm::SymbolicAddress address, Register dest) {
++    movePtr(address, dest);
++    as_ld(dest, dest, 0);
++  }
++
++  void loadPrivate(const Address& address, Register dest) {
++    loadPtr(address, dest);
++  }
++
++  FaultingCodeOffset loadDouble(const Address& addr, FloatRegister dest) {
++    if (is_intN(addr.offset, 16)) {
++      return FaultingCodeOffset(
++          as_lfd(dest, addr.base, addr.offset).getOffset());
++    }
++    if (HasPOWER10() && is_intN((intptr_t)addr.offset, 34)) {
++      return FaultingCodeOffset(
++          as_plfd(dest, addr.base, (int64_t)addr.offset, /*R=*/false)
++              .getOffset());
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(ImmWord(addr.offset), scratch);
++    return FaultingCodeOffset(as_lfdx(dest, addr.base, scratch).getOffset());
++  }
++  FaultingCodeOffset loadDouble(const BaseIndex& src, FloatRegister dest) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    computeScaledAddress(src, scratch);
++    if (is_intN(src.offset, 16)) {
++      return FaultingCodeOffset(as_lfd(dest, scratch, src.offset).getOffset());
++    }
++    Register scratch2 = temps.Acquire();
++    movePtr(ImmWord(src.offset), scratch2);
++    return FaultingCodeOffset(as_lfdx(dest, scratch, scratch2).getOffset());
++  }
++  FaultingCodeOffset loadFloat32(const Address& addr, FloatRegister dest) {
++    if (is_intN(addr.offset, 16)) {
++      return FaultingCodeOffset(
++          as_lfs(dest, addr.base, addr.offset).getOffset());
++    }
++    if (HasPOWER10() && is_intN((intptr_t)addr.offset, 34)) {
++      return FaultingCodeOffset(
++          as_plfs(dest, addr.base, (int64_t)addr.offset, /*R=*/false)
++              .getOffset());
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(ImmWord(addr.offset), scratch);
++    return FaultingCodeOffset(as_lfsx(dest, addr.base, scratch).getOffset());
++  }
++  FaultingCodeOffset loadFloat32(const BaseIndex& src, FloatRegister dest) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    computeScaledAddress(src, scratch);
++    if (is_intN(src.offset, 16)) {
++      return FaultingCodeOffset(as_lfs(dest, scratch, src.offset).getOffset());
++    }
++    Register scratch2 = temps.Acquire();
++    movePtr(ImmWord(src.offset), scratch2);
++    return FaultingCodeOffset(as_lfsx(dest, scratch, scratch2).getOffset());
++  }
++  // Load a FP constant into `dest`.
++  //
++  // +0.0 / +0.0f: `xxlxor dest, dest, dest` (1 insn). No register clobbers.
++  //
++  // POWER9 non-zero: constant pool load via `addpcis r16, hi; lfd/lfs fD,
++  // lo(r16); nop`. 2 real insns + nop, no LR clobber, no Return Address
++  // Stack corruption. lfs auto-expands single-precision to double, so no
++  // separate xscvspdpn step. Clobbers r16 (SavedScratchRegister). Pool
++  // entries are shared across duplicate constants.
++  //
++  // POWER8 non-zero: inline `movePtr + mtvsrd(+xscvspdpn)` path. We do NOT
++  // use the bcl-based pool path on POWER8: bcl clobbers LR and corrupts
++  // the Return Address Stack, which causes catastrophic mispredicts in
++  // hot FP-constant loops (~200x slowdown observed on cmp-bitselect.js).
++  //
++  // Precondition: must not be called inside an `enterNoPool` region when
++  // HasPOWER9() is true (the pool path calls `allocEntry` which asserts
++  // `inhibitPools_ == 0`). Audit-verified that no such call site exists
++  // today; the POWER8 inline path is unaffected.
++  void loadConstantDouble(double dp, FloatRegister dest) {
++    if (mozilla::IsPositiveZero(dp)) {
++      as_xxlxor(dest, dest, dest);
++      return;
++    }
++    if (HasPOWER9()) {
++      loadFromPoolFloat64(dest, dp);
++      return;
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    union {
++      double d;
++      uint64_t u;
++    } u;
++    u.d = dp;
++    movePtr(ImmWord(u.u), scratch);
++    as_mtvsrd(dest, scratch);
++  }
++  void loadConstantFloat32(float f, FloatRegister dest) {
++    if (mozilla::IsPositiveZero(f)) {
++      as_xxlxor(dest, dest, dest);
++      return;
++    }
++    if (HasPOWER9()) {
++      loadFromPoolFloat32(dest, f);
++      return;
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    union {
++      float f;
++      uint32_t u;
++    } u;
++    u.f = f;
++    movePtr(ImmWord(u.u), scratch);
++    x_sldi(scratch, scratch, 32);
++    as_mtvsrd(dest, scratch);
++    as_xscvspdpn(dest, dest);
++  }
++
++  void notBoolean(const ValueOperand& val) {
++    as_xori(val.valueReg(), val.valueReg(), 1);
++  }
++
++  [[nodiscard]] Register extractTag(const Address& address, Register scratch) {
++    loadPtr(address, scratch);
++    x_srdi(scratch, scratch, JSVAL_TAG_SHIFT);
++    return scratch;
++  }
++  [[nodiscard]] Register extractTag(const BaseIndex& address,
++                                    Register scratch) {
++    if (scratch == r0) {
++      // r0 cannot be used as a base register in D-form/X-form loads,
++      // so we need a separate temp for the intermediate address.
++      UseScratchRegisterScope temps(*this);
++      Register base = temps.Acquire();
++      computeScaledAddress(address, base);
++      loadPtr(Address(base, address.offset), scratch);
++    } else {
++      // scratch is a pool register (r11/r12) or another GPR that can
++      // serve as a base register, so reuse it for the address computation.
++      computeScaledAddress(address, scratch);
++      loadPtr(Address(scratch, address.offset), scratch);
++    }
++    x_srdi(scratch, scratch, JSVAL_TAG_SHIFT);
++    return scratch;
++  }
++  [[nodiscard]] Register extractTag(const ValueOperand& value,
++                                    Register scratch) {
++    splitTag(value, scratch);
++    return scratch;
++  }
++
++  [[nodiscard]] Register extractObject(const Address& address,
++                                       Register scratch) {
++    loadPtr(address, scratch);
++    as_rldicl(scratch, scratch, 0, 64 - JSVAL_TAG_SHIFT);
++    return scratch;
++  }
++  [[nodiscard]] Register extractObject(const ValueOperand& value,
++                                       Register scratch) {
++    unboxObject(value, scratch);
++    return scratch;
++  }
++  [[nodiscard]] Register extractInt32(const ValueOperand& value,
++                                      Register scratch) {
++    unboxInt32(value, scratch);
++    return scratch;
++  }
++  [[nodiscard]] Register extractString(const ValueOperand& value,
++                                       Register scratch) {
++    unboxString(value, scratch);
++    return scratch;
++  }
++  [[nodiscard]] Register extractSymbol(const ValueOperand& value,
++                                       Register scratch) {
++    unboxSymbol(value, scratch);
++    return scratch;
++  }
++  [[nodiscard]] Register extractBoolean(const ValueOperand& value,
++                                        Register scratch) {
++    unboxBoolean(value, scratch);
++    return scratch;
++  }
++
++  void testObjectSet(Condition cond, const ValueOperand& value, Register dest) {
++    MOZ_ASSERT(cond == Equal || cond == NotEqual);
++    {
++      UseScratchRegisterScope temps(*this);
++      Register tag = temps.Acquire();
++      splitTag(value, tag);
++      uint32_t t = JSVAL_TAG_OBJECT;
++      as_xoris(tag, tag, t >> 16);
++      as_cmplwi(tag, t & 0xFFFF);
++    }
++    ma_cmp_set(dest, cond);
++  }
++  void testUndefinedSet(Condition cond, const ValueOperand& value,
++                        Register dest) {
++    MOZ_ASSERT(cond == Equal || cond == NotEqual);
++    {
++      UseScratchRegisterScope temps(*this);
++      Register tag = temps.Acquire();
++      splitTag(value, tag);
++      // Use xoris+cmplwi to compare without a second scratch.
++      uint32_t t = JSVAL_TAG_UNDEFINED;
++      as_xoris(tag, tag, t >> 16);
++      as_cmplwi(tag, t & 0xFFFF);
++    }
++    ma_cmp_set(dest, cond);
++  }
++  void testNullSet(Condition cond, const ValueOperand& value, Register dest) {
++    MOZ_ASSERT(cond == Equal || cond == NotEqual);
++    {
++      UseScratchRegisterScope temps(*this);
++      Register tag = temps.Acquire();
++      splitTag(value, tag);
++      uint32_t t = JSVAL_TAG_NULL;
++      as_xoris(tag, tag, t >> 16);
++      as_cmplwi(tag, t & 0xFFFF);
++    }
++    ma_cmp_set(dest, cond);
++  }
++
++  BufferOffset ret() {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    as_ld(scratch, StackPointer, 0);
++    as_addi(StackPointer, StackPointer, 8);
++    xs_mtlr(scratch);
++    return as_blr();
++  }
++
++  void j(Label* dest) { jump(dest); }
++
++  void getWasmAnyRefGCThingChunk(Register anyref, Register dest) {
++    static_assert(js::gc::ChunkShift == 20);
++    as_rldicr(dest, anyref, 0, 43);
++  }
++
++  template <typename T>
++  void loadUnboxedValue(const T& address, MIRType type, AnyRegister dest) {
++    if (dest.isFloat()) {
++      loadInt32OrDouble(address, dest.fpu());
++    } else {
++      unboxNonDouble(address, dest.gpr(), ValueTypeFromMIRType(type));
++    }
++  }
++
++  void loadInt32OrDouble(const Address& src, FloatRegister dest);
++  void loadInt32OrDouble(const BaseIndex& addr, FloatRegister dest);
++
++  // ===============================================================
++  // Store instructions
++
++  FaultingCodeOffset store8(Register src, const Address& address) {
++    if (is_intN(address.offset, 16)) {
++      return FaultingCodeOffset(
++          as_stb(src, address.base, address.offset).getOffset());
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(ImmWord(address.offset), scratch);
++    return FaultingCodeOffset(as_stbx(src, address.base, scratch).getOffset());
++  }
++  FaultingCodeOffset store8(Register src, const BaseIndex& address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    computeScaledAddress(address, scratch);
++    if (is_intN(address.offset, 16)) {
++      return FaultingCodeOffset(
++          as_stb(src, scratch, address.offset).getOffset());
++    }
++    Register scratch2 = temps.Acquire();
++    movePtr(ImmWord(address.offset), scratch2);
++    return FaultingCodeOffset(as_stbx(src, scratch, scratch2).getOffset());
++  }
++  void store8(Imm32 imm, const Address& address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    move32(imm, scratch);
++    store8(scratch, address);
++  }
++  void store8(Imm32 imm, const BaseIndex& address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    move32(imm, scratch);
++    store8(scratch, address);
++  }
++
++  FaultingCodeOffset store16(Register src, const Address& address) {
++    if (is_intN(address.offset, 16)) {
++      return FaultingCodeOffset(
++          as_sth(src, address.base, address.offset).getOffset());
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(ImmWord(address.offset), scratch);
++    return FaultingCodeOffset(as_sthx(src, address.base, scratch).getOffset());
++  }
++  FaultingCodeOffset store16(Register src, const BaseIndex& address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    computeScaledAddress(address, scratch);
++    if (is_intN(address.offset, 16)) {
++      return FaultingCodeOffset(
++          as_sth(src, scratch, address.offset).getOffset());
++    }
++    Register scratch2 = temps.Acquire();
++    movePtr(ImmWord(address.offset), scratch2);
++    return FaultingCodeOffset(as_sthx(src, scratch, scratch2).getOffset());
++  }
++  void store16(Imm32 imm, const Address& address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    move32(imm, scratch);
++    store16(scratch, address);
++  }
++  void store16(Imm32 imm, const BaseIndex& address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    move32(imm, scratch);
++    store16(scratch, address);
++  }
++  template <typename T>
++  void store16Unaligned(Register src, const T& dest) {
++    store16(src, dest);
++  }
++
++  FaultingCodeOffset store32(Register src, const Address& address) {
++    if (is_intN(address.offset, 16)) {
++      return FaultingCodeOffset(
++          as_stw(src, address.base, address.offset).getOffset());
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(ImmWord(address.offset), scratch);
++    return FaultingCodeOffset(as_stwx(src, address.base, scratch).getOffset());
++  }
++  FaultingCodeOffset store32(Register src, const BaseIndex& address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    computeScaledAddress(address, scratch);
++    if (is_intN(address.offset, 16)) {
++      return FaultingCodeOffset(
++          as_stw(src, scratch, address.offset).getOffset());
++    }
++    Register scratch2 = temps.Acquire();
++    movePtr(ImmWord(address.offset), scratch2);
++    return FaultingCodeOffset(as_stwx(src, scratch, scratch2).getOffset());
++  }
++  void store32(Register src, AbsoluteAddress address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(ImmWord((uintptr_t)address.addr), scratch);
++    as_stw(src, scratch, 0);
++  }
++  void store32(Imm32 src, const Address& address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    move32(src, scratch);
++    store32(scratch, address);
++  }
++  void store32(Imm32 src, const BaseIndex& address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    move32(src, scratch);
++    store32(scratch, address);
++  }
++  template <typename T>
++  void store32Unaligned(Register src, const T& dest) {
++    store32(src, dest);
++  }
++
++  void store64(Imm64 imm, Address address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(ImmWord(imm.value), scratch);
++    storePtr(scratch, address);
++  }
++  void store64(Imm64 imm, const BaseIndex& address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(ImmWord(imm.value), scratch);
++    storePtr(scratch, address);
++  }
++  FaultingCodeOffset store64(Register64 src, Address address) {
++    return storePtr(src.reg, address);
++  }
++  FaultingCodeOffset store64(Register64 src, const BaseIndex& address) {
++    return storePtr(src.reg, address);
++  }
++  template <typename T>
++  void store64Unaligned(Register64 src, const T& dest) {
++    store64(src, dest);
++  }
++
++  template <typename T>
++  void storePtr(ImmWord imm, T address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(imm, scratch);
++    storePtr(scratch, address);
++  }
++  template <typename T>
++  void storePtr(ImmPtr imm, T address) {
++    storePtr(ImmWord(uintptr_t(imm.value)), address);
++  }
++  template <typename T>
++  void storePtr(ImmGCPtr imm, T address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(imm, scratch);
++    storePtr(scratch, address);
++  }
++  void storePtr(Register src, AbsoluteAddress dest) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(ImmWord((uintptr_t)dest.addr), scratch);
++    as_std(src, scratch, 0);
++  }
++  FaultingCodeOffset storePtr(Register src, const Address& address) {
++    // as_std (DS-form) requires 4-byte aligned offset.
++    if (is_intN(address.offset, 16) && !(address.offset & 0x3)) {
++      return FaultingCodeOffset(
++          as_std(src, address.base, address.offset).getOffset());
++    }
++    if (HasPOWER10() && is_intN((intptr_t)address.offset, 34)) {
++      return FaultingCodeOffset(
++          as_pstd(src, address.base, (int64_t)address.offset, /*R=*/false)
++              .getOffset());
++    }
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    movePtr(ImmWord(address.offset), scratch);
++    return FaultingCodeOffset(as_stdx(src, address.base, scratch).getOffset());
++  }
++  FaultingCodeOffset storePtr(Register src, const BaseIndex& address) {
++    UseScratchRegisterScope temps(*this);
++    Register scratch = temps.Acquire();
++    computeScaledAddress(address, scratch);
++    if (is_intN(address.offset, 16) && !(address.offset & 0x3)) {
++      return FaultingCodeOffset(
++          as_std(src, scratch, address.offset).getOffset());
++    }
++    Register scratch2 = temps.Acquire();
++    movePtr(ImmWord(address.offset), scratch2);
++    return FaultingCodeOffset(as_stdx(src, scratch, scratch2).getOffset());
++  }
++
++  // ===============================================================
++  // Misc
++
++  void handleFailureWithHandlerTail(Label* profilerExitTail, Label* bailoutTail,
++                                    uint32_t* returnValueCheckOffset);
++
++  inline void incrementInt32Value(const Address& addr);
++
++  void zeroDouble(FloatRegister reg) { as_xxlxor(reg, reg, reg); }
++
++  void writeCodePointer(CodeLabel* label) {
++    label->patchAt()->bind(currentOffset());
++    label->setLinkMode(CodeLabel::RawPointer);
++    m_buffer.ensureSpace(sizeof(void*));
++    writeInst(-1);
++    writeInst(-1);
++  }
++  void writeDataRelocation(const Value& val) {
++    if (val.isGCThing()) {
++      gc::Cell* cell = val.toGCThing();
++      if (cell && gc::IsInsideNursery(cell)) {
++        embedsNurseryPointers_ = true;
++      }
++      dataRelocations_.writeUnsigned(currentOffset());
++    }
++  }
++  void writeDataRelocation(CodeOffset off, const Value& val) {
++    if (val.isGCThing()) {
++      gc::Cell* cell = val.toGCThing();
++      if (cell && gc::IsInsideNursery(cell)) {
++        embedsNurseryPointers_ = true;
++      }
++      dataRelocations_.writeUnsigned(off.offset());
++    }
++  }
++
++  CodeOffset toggledJump(Label* label) {
++    CodeOffset ret(nextOffset().getOffset());
++    jump(label);
++    return ret;
++  }
++  CodeOffset toggledCall(JitCode* target, bool enabled);
++  // 8 instructions for load64 + mtctr + bctrl = 10 instructions total.
++  static size_t ToggledCallSize(uint8_t* code) { return 10 * sizeof(uint32_t); }
++
++  void checkStackAlignment() {}
++
++  static void calculateAlignedStackPointer(void** stackPointer) {
++    *stackPointer = reinterpret_cast<void*>((uintptr_t(*stackPointer)) &
++                                            ~(ABIStackAlignment - 1));
++  }
++
++  void lea(Operand addr, Register dest) {
++    // x86-ism; on PPC, compute effective address manually.
++    MOZ_CRASH("PPC64: lea not supported; use computeEffectiveAddress");
++  }
++
++  void abiret() { as_blr(); }
++
++  void profilerEnterFrame(Register framePtr, Register scratch);
++  void profilerExitFrame();
++
++  void outOfLineWasmTruncateToInt32Check(
++      FloatRegister input, Register output, MIRType fromType, TruncFlags flags,
++      Label* rejoin, const wasm::TrapSiteDesc& trapSiteDesc);
++  void outOfLineWasmTruncateToInt64Check(
++      FloatRegister input, Register64 output, MIRType fromType,
++      TruncFlags flags, Label* rejoin, const wasm::TrapSiteDesc& trapSiteDesc);
++
++  void wasmLoadImpl(const wasm::MemoryAccessDesc& access, Register memoryBase,
++                    Register ptr, Register ptrScratch, AnyRegister output);
++  void wasmStoreImpl(const wasm::MemoryAccessDesc& access, AnyRegister value,
++                     Register memoryBase, Register ptr, Register ptrScratch);
++  void wasmLoadI64Impl(const wasm::MemoryAccessDesc& access,
++                       Register memoryBase, Register ptr, Register ptrScratch,
++                       Register64 output);
++  void wasmStoreI64Impl(const wasm::MemoryAccessDesc& access, Register64 value,
++                        Register memoryBase, Register ptr, Register ptrScratch);
++
++  // Last-byte probing load to enforce wasm-spec atomicity for multi-byte
++  // wasm accesses on POWER ISA. POWER permits unaligned page-spanning
++  // accesses to commit one half before the other half takes a DSI; wasm
++  // requires atomicity. Touching the last byte of the upcoming access
++  // with a 1-byte lbzx triggers SIGSEGV (→ wasm trap via the signal
++  // handler) before the actual access executes — POWER's precise-
++  // interrupt model guarantees the subsequent access is never
++  // architecturally executed if the probe faults.
++  //
++  // Wasm linear memory is one contiguous mapped region followed by an
++  // mprotect'd guard, so last-byte-mapped ⇒ all-bytes-mapped, and a
++  // single-byte probe is sufficient regardless of access size.
++  //
++  // No-op when HasPOWER9() (real POWER9/POWER10 silicon handles page-
++  // spanning unaligned stores atomically at the µarch level), and when
++  // access size is 1. Never called on the atomic path: atomic ops are
++  // naturally aligned per wasm spec + ISA-enforced lwarx alignment, so
++  // they cannot span pages; misaligned atomics take a precise SIGBUS
++  // before any commit.
++  //
++  // 2 instructions when emitted (addi + lbzx).
++  void wasmProbeLastByte(const wasm::MemoryAccessDesc& access,
++                         Register memoryBase, Register ptr);
++};
++
++typedef MacroAssemblerPPC64Compat MacroAssemblerSpecific;
++
++}  // namespace jit
++}  // namespace js
++
++#endif /* jit_ppc64_MacroAssembler_ppc64_h */
+diff --git a/js/src/jit/ppc64/MoveEmitter-ppc64.cpp b/js/src/jit/ppc64/MoveEmitter-ppc64.cpp
+new file mode 100644
+index 000000000000..989d3f61f121
+--- /dev/null
++++ b/js/src/jit/ppc64/MoveEmitter-ppc64.cpp
+@@ -0,0 +1,357 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/ppc64/MoveEmitter-ppc64.h"
++
++#include "jit/MacroAssembler-inl.h"
++
++using namespace js;
++using namespace js::jit;
++
++void MoveEmitterPPC64::breakCycle(const MoveOperand& from,
++                                  const MoveOperand& to, MoveOp::Type type,
++                                  uint32_t slotId) {
++  switch (type) {
++    case MoveOp::FLOAT32:
++      if (to.isMemory()) {
++        ScratchFloat32Scope fpscratch32(masm);
++        masm.loadFloat32(getAdjustedAddress(to), fpscratch32);
++        masm.storeFloat32(fpscratch32, cycleSlot(slotId));
++      } else {
++        masm.storeFloat32(to.floatReg(), cycleSlot(slotId));
++      }
++      break;
++    case MoveOp::DOUBLE:
++      if (to.isMemory()) {
++        ScratchDoubleScope fpscratch64(masm);
++        masm.loadDouble(getAdjustedAddress(to), fpscratch64);
++        masm.storeDouble(fpscratch64, cycleSlot(slotId));
++      } else {
++        masm.storeDouble(to.floatReg(), cycleSlot(slotId));
++      }
++      break;
++    case MoveOp::INT32:
++      if (to.isMemory()) {
++        UseScratchRegisterScope temps(masm);
++        Register scratch = temps.Acquire();
++        masm.load32(getAdjustedAddress(to), scratch);
++        masm.store32(scratch, cycleSlot(0));
++      } else {
++        masm.store32(to.reg(), cycleSlot(0));
++      }
++      break;
++    case MoveOp::GENERAL:
++      if (to.isMemory()) {
++        UseScratchRegisterScope temps(masm);
++        Register scratch = temps.Acquire();
++        masm.loadPtr(getAdjustedAddress(to), scratch);
++        masm.storePtr(scratch, cycleSlot(0));
++      } else {
++        masm.storePtr(to.reg(), cycleSlot(0));
++      }
++      break;
++    case MoveOp::SIMD128:
++      if (to.isMemory()) {
++        ScratchSimd128Scope scratch(masm);
++        masm.loadUnalignedSimd128(getAdjustedAddress(to), scratch);
++        masm.storeUnalignedSimd128(scratch, cycleSlot(slotId));
++      } else {
++        masm.storeUnalignedSimd128(to.floatReg(), cycleSlot(slotId));
++      }
++      break;
++    default:
++      MOZ_CRASH("Unexpected move type");
++  }
++}
++
++void MoveEmitterPPC64::completeCycle(const MoveOperand& from,
++                                     const MoveOperand& to, MoveOp::Type type,
++                                     uint32_t slotId) {
++  switch (type) {
++    case MoveOp::FLOAT32:
++      if (to.isMemory()) {
++        ScratchFloat32Scope fpscratch32(masm);
++        masm.loadFloat32(cycleSlot(slotId), fpscratch32);
++        masm.storeFloat32(fpscratch32, getAdjustedAddress(to));
++      } else {
++        masm.loadFloat32(cycleSlot(slotId), to.floatReg());
++      }
++      break;
++    case MoveOp::DOUBLE:
++      if (to.isMemory()) {
++        ScratchDoubleScope fpscratch64(masm);
++        masm.loadDouble(cycleSlot(slotId), fpscratch64);
++        masm.storeDouble(fpscratch64, getAdjustedAddress(to));
++      } else {
++        masm.loadDouble(cycleSlot(slotId), to.floatReg());
++      }
++      break;
++    case MoveOp::INT32:
++      MOZ_ASSERT(slotId == 0);
++      if (to.isMemory()) {
++        UseScratchRegisterScope temps(masm);
++        Register scratch = temps.Acquire();
++        masm.load32(cycleSlot(0), scratch);
++        masm.store32(scratch, getAdjustedAddress(to));
++      } else {
++        masm.load32(cycleSlot(0), to.reg());
++      }
++      break;
++    case MoveOp::GENERAL:
++      MOZ_ASSERT(slotId == 0);
++      if (to.isMemory()) {
++        UseScratchRegisterScope temps(masm);
++        Register scratch = temps.Acquire();
++        masm.loadPtr(cycleSlot(0), scratch);
++        masm.storePtr(scratch, getAdjustedAddress(to));
++      } else {
++        masm.loadPtr(cycleSlot(0), to.reg());
++      }
++      break;
++    case MoveOp::SIMD128:
++      if (to.isMemory()) {
++        ScratchSimd128Scope scratch(masm);
++        masm.loadUnalignedSimd128(cycleSlot(slotId), scratch);
++        masm.storeUnalignedSimd128(scratch, getAdjustedAddress(to));
++      } else {
++        masm.loadUnalignedSimd128(cycleSlot(slotId), to.floatReg());
++      }
++      break;
++    default:
++      MOZ_CRASH("Unexpected move type");
++  }
++}
++
++void MoveEmitterPPC64::emit(const MoveResolver& moves) {
++  if (moves.numCycles()) {
++    // SpillSlotSize must be wide enough for the widest cycled value
++    // (SIMD128 = 16 bytes). The stride below assumes the same. See
++    // Architecture-ppc64.h for the rationale.
++    static_assert(SpillSlotSize == 16);
++    masm.reserveStack(moves.numCycles() * SpillSlotSize);
++    pushedAtCycle_ = masm.framePushed();
++  }
++
++  for (size_t i = 0; i < moves.numMoves(); i++) {
++    emit(moves.getMove(i));
++  }
++}
++
++Address MoveEmitterPPC64::cycleSlot(uint32_t slot, uint32_t subslot) const {
++  int32_t offset = masm.framePushed() - pushedAtCycle_;
++  // Stride must match the per-cycle reservation in emit(); using a
++  // narrower stride causes adjacent SIMD128 slots to overlap.
++  return Address(StackPointer, offset + slot * SpillSlotSize + subslot);
++}
++
++int32_t MoveEmitterPPC64::getAdjustedOffset(const MoveOperand& operand) {
++  MOZ_ASSERT(operand.isMemoryOrEffectiveAddress());
++  if (operand.base() != StackPointer) {
++    return operand.disp();
++  }
++
++  return operand.disp() + masm.framePushed() - pushedAtStart_;
++}
++
++Address MoveEmitterPPC64::getAdjustedAddress(const MoveOperand& operand) {
++  return Address(operand.base(), getAdjustedOffset(operand));
++}
++
++void MoveEmitterPPC64::emitMove(const MoveOperand& from,
++                                const MoveOperand& to) {
++  if (from.isGeneralReg()) {
++    if (to.isGeneralReg()) {
++      masm.movePtr(from.reg(), to.reg());
++    } else if (to.isMemory()) {
++      masm.storePtr(from.reg(), getAdjustedAddress(to));
++    } else {
++      MOZ_CRASH("Invalid emitMove arguments.");
++    }
++  } else if (from.isMemory()) {
++    if (to.isGeneralReg()) {
++      masm.loadPtr(getAdjustedAddress(from), to.reg());
++    } else if (to.isMemory()) {
++      UseScratchRegisterScope temps(masm);
++      Register scratch = temps.Acquire();
++      masm.loadPtr(getAdjustedAddress(from), scratch);
++      masm.storePtr(scratch, getAdjustedAddress(to));
++    } else {
++      MOZ_CRASH("Invalid emitMove arguments.");
++    }
++  } else if (from.isEffectiveAddress()) {
++    if (to.isGeneralReg()) {
++      masm.computeEffectiveAddress(getAdjustedAddress(from), to.reg());
++    } else if (to.isMemory()) {
++      UseScratchRegisterScope temps(masm);
++      Register scratch = temps.Acquire();
++      masm.computeEffectiveAddress(getAdjustedAddress(from), scratch);
++      masm.storePtr(scratch, getAdjustedAddress(to));
++    } else {
++      MOZ_CRASH("Invalid emitMove arguments.");
++    }
++  } else {
++    MOZ_CRASH("Invalid emitMove arguments.");
++  }
++}
++
++void MoveEmitterPPC64::emitInt32Move(const MoveOperand& from,
++                                     const MoveOperand& to) {
++  if (from.isGeneralReg()) {
++    if (to.isGeneralReg()) {
++      masm.move32(from.reg(), to.reg());
++    } else if (to.isMemory()) {
++      masm.store32(from.reg(), getAdjustedAddress(to));
++    } else {
++      MOZ_CRASH("Invalid emitInt32Move arguments.");
++    }
++  } else if (from.isMemory()) {
++    if (to.isGeneralReg()) {
++      masm.load32(getAdjustedAddress(from), to.reg());
++    } else if (to.isMemory()) {
++      UseScratchRegisterScope temps(masm);
++      Register scratch = temps.Acquire();
++      masm.load32(getAdjustedAddress(from), scratch);
++      masm.store32(scratch, getAdjustedAddress(to));
++    } else {
++      MOZ_CRASH("Invalid emitInt32Move arguments.");
++    }
++  } else if (from.isEffectiveAddress()) {
++    if (to.isGeneralReg()) {
++      masm.computeEffectiveAddress(getAdjustedAddress(from), to.reg());
++    } else if (to.isMemory()) {
++      UseScratchRegisterScope temps(masm);
++      Register scratch = temps.Acquire();
++      masm.computeEffectiveAddress(getAdjustedAddress(from), scratch);
++      masm.store32(scratch, getAdjustedAddress(to));
++    } else {
++      MOZ_CRASH("Invalid emitInt32Move arguments.");
++    }
++  } else {
++    MOZ_CRASH("Invalid emitInt32Move arguments.");
++  }
++}
++
++void MoveEmitterPPC64::emitFloat32Move(const MoveOperand& from,
++                                       const MoveOperand& to) {
++  if (from.isFloatReg()) {
++    if (to.isFloatReg()) {
++      masm.moveFloat32(from.floatReg(), to.floatReg());
++    } else {
++      MOZ_ASSERT(to.isMemory());
++      masm.storeFloat32(from.floatReg(), getAdjustedAddress(to));
++    }
++  } else if (to.isFloatReg()) {
++    MOZ_ASSERT(from.isMemory());
++    masm.loadFloat32(getAdjustedAddress(from), to.floatReg());
++  } else {
++    MOZ_ASSERT(from.isMemory());
++    MOZ_ASSERT(to.isMemory());
++    ScratchFloat32Scope fpscratch32(masm);
++    masm.loadFloat32(getAdjustedAddress(from), fpscratch32);
++    masm.storeFloat32(fpscratch32, getAdjustedAddress(to));
++  }
++}
++
++void MoveEmitterPPC64::emitDoubleMove(const MoveOperand& from,
++                                      const MoveOperand& to) {
++  if (from.isFloatReg()) {
++    if (to.isFloatReg()) {
++      masm.moveDouble(from.floatReg(), to.floatReg());
++    } else if (to.isGeneralReg()) {
++      // FPR -> GPR: use mfvsrd directly.
++      masm.as_mfvsrd(to.reg(), from.floatReg());
++    } else {
++      MOZ_ASSERT(to.isMemory());
++      masm.storeDouble(from.floatReg(), getAdjustedAddress(to));
++    }
++  } else if (to.isFloatReg()) {
++    if (from.isMemory()) {
++      masm.loadDouble(getAdjustedAddress(from), to.floatReg());
++    } else {
++      // GPR -> FPR: use mtvsrd directly.
++      masm.as_mtvsrd(to.floatReg(), from.reg());
++    }
++  } else {
++    MOZ_ASSERT(from.isMemory());
++    MOZ_ASSERT(to.isMemory());
++    ScratchDoubleScope fpscratch64(masm);
++    masm.loadDouble(getAdjustedAddress(from), fpscratch64);
++    masm.storeDouble(fpscratch64, getAdjustedAddress(to));
++  }
++}
++
++void MoveEmitterPPC64::emitSimd128Move(const MoveOperand& from,
++                                       const MoveOperand& to) {
++  if (from.isFloatReg()) {
++    if (to.isFloatReg()) {
++      masm.moveSimd128(from.floatReg(), to.floatReg());
++    } else {
++      MOZ_ASSERT(to.isMemory());
++      masm.storeUnalignedSimd128(from.floatReg(), getAdjustedAddress(to));
++    }
++  } else if (to.isFloatReg()) {
++    MOZ_ASSERT(from.isMemory());
++    masm.loadUnalignedSimd128(getAdjustedAddress(from), to.floatReg());
++  } else {
++    MOZ_ASSERT(from.isMemory());
++    MOZ_ASSERT(to.isMemory());
++    ScratchSimd128Scope scratch(masm);
++    masm.loadUnalignedSimd128(getAdjustedAddress(from), scratch);
++    masm.storeUnalignedSimd128(scratch, getAdjustedAddress(to));
++  }
++}
++
++void MoveEmitterPPC64::emit(const MoveOp& move) {
++  const MoveOperand& from = move.from();
++  const MoveOperand& to = move.to();
++
++  if (move.isCycleEnd() && move.isCycleBegin()) {
++    breakCycle(from, to, move.endCycleType(), move.cycleBeginSlot());
++    completeCycle(from, to, move.type(), move.cycleEndSlot());
++    return;
++  }
++
++  if (move.isCycleEnd()) {
++    MOZ_ASSERT(inCycle_);
++    completeCycle(from, to, move.type(), move.cycleEndSlot());
++    MOZ_ASSERT(inCycle_ > 0);
++    inCycle_--;
++    return;
++  }
++
++  if (move.isCycleBegin()) {
++    breakCycle(from, to, move.endCycleType(), move.cycleBeginSlot());
++    inCycle_++;
++  }
++
++  switch (move.type()) {
++    case MoveOp::FLOAT32:
++      emitFloat32Move(from, to);
++      break;
++    case MoveOp::DOUBLE:
++      emitDoubleMove(from, to);
++      break;
++    case MoveOp::SIMD128:
++      emitSimd128Move(from, to);
++      break;
++    case MoveOp::INT32:
++      emitInt32Move(from, to);
++      break;
++    case MoveOp::GENERAL:
++      emitMove(from, to);
++      break;
++    default:
++      MOZ_CRASH("Unexpected move type");
++  }
++}
++
++void MoveEmitterPPC64::assertDone() { MOZ_ASSERT(inCycle_ == 0); }
++
++void MoveEmitterPPC64::finish() {
++  assertDone();
++
++  masm.freeStack(masm.framePushed() - pushedAtStart_);
++}
+diff --git a/js/src/jit/ppc64/MoveEmitter-ppc64.h b/js/src/jit/ppc64/MoveEmitter-ppc64.h
+new file mode 100644
+index 000000000000..a9faa34de6bb
+--- /dev/null
++++ b/js/src/jit/ppc64/MoveEmitter-ppc64.h
+@@ -0,0 +1,64 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_MoveEmitter_ppc64_h
++#define jit_ppc64_MoveEmitter_ppc64_h
++
++#include "jit/MacroAssembler.h"
++#include "jit/MoveResolver.h"
++
++namespace js {
++namespace jit {
++
++class MoveEmitterPPC64 {
++  void emitDoubleMove(const MoveOperand& from, const MoveOperand& to);
++  void emitSimd128Move(const MoveOperand& from, const MoveOperand& to);
++  void breakCycle(const MoveOperand& from, const MoveOperand& to,
++                  MoveOp::Type type, uint32_t slot);
++  void completeCycle(const MoveOperand& from, const MoveOperand& to,
++                     MoveOp::Type type, uint32_t slot);
++
++ protected:
++  uint32_t inCycle_;
++  MacroAssembler& masm;
++
++  uint32_t pushedAtStart_;
++
++  int32_t pushedAtCycle_;
++
++  void assertDone();
++  Address cycleSlot(uint32_t slot, uint32_t subslot = 0) const;
++  int32_t getAdjustedOffset(const MoveOperand& operand);
++  Address getAdjustedAddress(const MoveOperand& operand);
++
++  void emitMove(const MoveOperand& from, const MoveOperand& to);
++  void emitInt32Move(const MoveOperand& from, const MoveOperand& to);
++  void emitFloat32Move(const MoveOperand& from, const MoveOperand& to);
++  void emit(const MoveOp& move);
++
++ public:
++  explicit MoveEmitterPPC64(MacroAssembler& masm)
++      : inCycle_(0),
++        masm(masm),
++        pushedAtStart_(masm.framePushed()),
++        pushedAtCycle_(-1) {}
++
++  ~MoveEmitterPPC64() { assertDone(); }
++
++  void emit(const MoveResolver& moves);
++  void finish();
++  // setScratchRegister is part of the cross-arch MoveEmitter interface
++  // but we never spill, so there's no scratch to set. No-op kept for
++  // shared-code compatibility.
++  void setScratchRegister(Register reg) {}
++};
++
++typedef MoveEmitterPPC64 MoveEmitter;
++
++}  // namespace jit
++}  // namespace js
++
++#endif /* jit_ppc64_MoveEmitter_ppc64_h */
+diff --git a/js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h b/js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h
+new file mode 100644
+index 000000000000..aa874dfd6732
+--- /dev/null
++++ b/js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h
+@@ -0,0 +1,83 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_SharedICHelpers_ppc64_inl_h
++#define jit_ppc64_SharedICHelpers_ppc64_inl_h
++
++#include "jit/BaselineFrame.h"
++#include "jit/SharedICHelpers.h"
++
++#include "jit/MacroAssembler-inl.h"
++
++namespace js {
++namespace jit {
++
++inline void EmitBaselineTailCallVM(TrampolinePtr target, MacroAssembler& masm,
++                                   uint32_t argSize) {
++#ifdef DEBUG
++  Register scratch = R2.scratchReg();
++
++  // Compute frame size.
++  masm.movePtr(FramePointer, scratch);
++  masm.subPtr(StackPointer, scratch);
++
++  // Store frame size without VMFunction arguments for debug assertions.
++  masm.subPtr(Imm32(argSize), scratch);
++  Address frameSizeAddr(FramePointer,
++                        BaselineFrame::reverseOffsetOfDebugFrameSize());
++  masm.store32(scratch, frameSizeAddr);
++  masm.addPtr(Imm32(argSize), scratch);
++#endif
++
++  // Push frame descriptor and perform the tail call.
++  masm.push(FrameDescriptor(FrameType::BaselineJS));
++
++  // The return address is in LR (set by the original bl/bctrl call).
++  // The VMWrapper code will push it via pushReturnAddress().
++
++  masm.jump(target);
++}
++
++inline void EmitBaselineCallVM(TrampolinePtr target, MacroAssembler& masm) {
++  masm.push(FrameDescriptor(FrameType::BaselineStub));
++  masm.call(target);
++}
++
++inline void EmitBaselineEnterStubFrame(MacroAssembler& masm, Register scratch) {
++  MOZ_ASSERT(scratch != ICTailCallReg);
++
++#ifdef DEBUG
++  // Compute frame size.
++  masm.movePtr(FramePointer, scratch);
++  masm.subPtr(StackPointer, scratch);
++
++  Address frameSizeAddr(FramePointer,
++                        BaselineFrame::reverseOffsetOfDebugFrameSize());
++  masm.store32(scratch, frameSizeAddr);
++#endif
++
++  // Note: when making changes here, don't forget to update
++  // BaselineStubFrame if needed.
++
++  // Push frame descriptor and return address.
++  // LR holds the return address; read it into ICTailCallReg to push.
++  masm.Push(FrameDescriptor(FrameType::BaselineJS));
++  masm.xs_mflr(ICTailCallReg);
++  masm.Push(ICTailCallReg);
++
++  // Save old frame pointer, stack pointer and stub reg.
++  masm.Push(FramePointer);
++  masm.movePtr(StackPointer, FramePointer);
++  masm.Push(ICStubReg);
++
++  // Stack should remain aligned.
++  masm.assertStackAlignment(sizeof(Value), 0);
++}
++
++}  // namespace jit
++}  // namespace js
++
++#endif /* jit_ppc64_SharedICHelpers_ppc64_inl_h */
+diff --git a/js/src/jit/ppc64/SharedICHelpers-ppc64.h b/js/src/jit/ppc64/SharedICHelpers-ppc64.h
+new file mode 100644
+index 000000000000..31ba830d2609
+--- /dev/null
++++ b/js/src/jit/ppc64/SharedICHelpers-ppc64.h
+@@ -0,0 +1,97 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_SharedICHelpers_ppc64_h
++#define jit_ppc64_SharedICHelpers_ppc64_h
++
++#include "jit/BaselineIC.h"
++#include "jit/JitFrames.h"
++#include "jit/MacroAssembler.h"
++#include "jit/SharedICRegisters.h"
++
++namespace js {
++namespace jit {
++
++// Distance from sp to the top Value inside an IC stub (no return address on
++// the stack on PPC64).
++static const size_t ICStackValueOffset = 0;
++
++struct BaselineStubFrame {
++  uintptr_t savedFrame;
++  uintptr_t savedStub;
++  uintptr_t returnAddress;
++  uintptr_t descriptor;
++};
++
++inline void EmitRestoreTailCallReg(MacroAssembler& masm) {
++  // On PPC64, LR always holds the return address after a bl/bctrl call.
++  // No-op: LR is the hardware link register, not a GPR on the stack.
++}
++
++inline void EmitRepushTailCallReg(MacroAssembler& masm) {
++  // No-op: LR already holds the return address.
++}
++
++inline void EmitCallIC(MacroAssembler& masm, CodeOffset* callOffset) {
++  // The stub pointer must already be in ICStubReg.
++  // Load stubcode pointer from the ICStub.
++  // R2 won't be active when we call ICs, so we can use it as scratch.
++  masm.loadPtr(Address(ICStubReg, ICStub::offsetOfStubCode()), R2.scratchReg());
++
++  // Call the stubcode. On PPC64 call(Register) emits mtctr + bctrl,
++  // which sets LR to the address after bctrl.
++  masm.call(R2.scratchReg());
++  *callOffset = CodeOffset(masm.currentOffset());
++}
++
++inline void EmitReturnFromIC(MacroAssembler& masm) {
++  // Return via hardware LR (set by the original bl/bctrl call).
++  masm.as_blr();
++}
++
++inline void EmitBaselineLeaveStubFrame(MacroAssembler& masm) {
++  masm.loadPtr(
++      Address(FramePointer, BaselineStubFrameLayout::ICStubOffsetFromFP),
++      ICStubReg);
++
++  masm.movePtr(FramePointer, StackPointer);
++  masm.Pop(FramePointer);
++
++  // Load the return address and restore it to LR.
++  masm.Pop(ICTailCallReg);
++  masm.xs_mtlr(ICTailCallReg);
++
++  // Discard the frame descriptor.
++  {
++    UseScratchRegisterScope temps(masm);
++    Register scratch = temps.Acquire();
++    masm.Pop(scratch);
++  }
++}
++
++template <typename AddrType>
++inline void EmitPreBarrier(MacroAssembler& masm, const AddrType& addr,
++                           MIRType type) {
++  // On PPC64, LR is clobbered by guardedCallPreBarrier. Save it first.
++  masm.xs_mflr(r0);
++  masm.push(r0);
++  masm.guardedCallPreBarrier(addr, type);
++  masm.pop(r0);
++  masm.xs_mtlr(r0);
++}
++
++inline void EmitStubGuardFailure(MacroAssembler& masm) {
++  // Load next stub into ICStubReg.
++  masm.loadPtr(Address(ICStubReg, ICCacheIRStub::offsetOfNext()), ICStubReg);
++
++  // Return address is in LR. Jump to the next stubcode.
++  masm.jump(Address(ICStubReg, ICStub::offsetOfStubCode()));
++}
++
++}  // namespace jit
++}  // namespace js
++
++#endif /* jit_ppc64_SharedICHelpers_ppc64_h */
+diff --git a/js/src/jit/ppc64/SharedICRegisters-ppc64.h b/js/src/jit/ppc64/SharedICRegisters-ppc64.h
+new file mode 100644
+index 000000000000..ddf67342f855
+--- /dev/null
++++ b/js/src/jit/ppc64/SharedICRegisters-ppc64.h
+@@ -0,0 +1,46 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_SharedICRegisters_ppc64_h
++#define jit_ppc64_SharedICRegisters_ppc64_h
++
++#include "jit/ppc64/Assembler-ppc64.h"
++#include "jit/Registers.h"
++#include "jit/RegisterSets.h"
++
++namespace js {
++namespace jit {
++
++// ValueOperands R0, R1, and R2.
++// R0 == JSReturnReg, and R2 uses registers not preserved across calls. R1 value
++// should be preserved across calls.
++static constexpr ValueOperand R0(r5);
++static constexpr ValueOperand R1(r15);
++static constexpr ValueOperand R2(r4);
++
++// ICTailCallReg and ICStubReg.
++// On PPC64, LR is not a GPR, so ICTailCallReg must be a normal GPR.
++// PPC64 ELFv2 has no volatile non-arg GPRs (r3-r10 are all arg regs), so
++// using an arg register risks clobbering by ABI calls with enough arguments.
++// We use callee-saved registers instead, matching MIPS64/RISC-V strategy.
++// These are excluded from BaselineICAvailableGeneralRegs.
++static constexpr Register ICTailCallReg = r27;
++static constexpr Register ICStubReg = r26;
++
++// FloatReg0 must be equal to ReturnFloatReg.
++static constexpr FloatRegister FloatReg0 = {FloatRegisters::f1,
++                                            FloatRegisters::Double};
++static constexpr FloatRegister FloatReg1 = {FloatRegisters::f2,
++                                            FloatRegisters::Double};
++static constexpr FloatRegister FloatReg2 = {FloatRegisters::f3,
++                                            FloatRegisters::Double};
++static constexpr FloatRegister FloatReg3 = {FloatRegisters::f4,
++                                            FloatRegisters::Double};
++
++}  // namespace jit
++}  // namespace js
++
++#endif /* jit_ppc64_SharedICRegisters_ppc64_h */
+diff --git a/js/src/jit/ppc64/Simulator-ppc64.cpp b/js/src/jit/ppc64/Simulator-ppc64.cpp
+new file mode 100644
+index 000000000000..8b29eb3add04
+--- /dev/null
++++ b/js/src/jit/ppc64/Simulator-ppc64.cpp
+@@ -0,0 +1,7296 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/ppc64/Simulator-ppc64.h"
++
++#include <cinttypes>
++#include <cmath>
++#include <cstring>
++#include <float.h>
++#include <limits>
++
++#include "jit/AtomicOperations.h"
++#include "jit/ppc64/Assembler-ppc64.h"
++#include "js/Conversions.h"
++#include "threading/LockGuard.h"
++#include "vm/Float16.h"
++#include "vm/JSContext.h"
++#include "vm/Runtime.h"
++#include "wasm/WasmInstance.h"
++#include "wasm/WasmSignalHandlers.h"
++
++#define I8(v) static_cast<int8_t>(v)
++#define I16(v) static_cast<int16_t>(v)
++#define U16(v) static_cast<uint16_t>(v)
++#define I32(v) static_cast<int32_t>(v)
++#define U32(v) static_cast<uint32_t>(v)
++#define I64(v) static_cast<int64_t>(v)
++#define U64(v) static_cast<uint64_t>(v)
++#define I128(v) static_cast<__int128_t>(v)
++#define U128(v) static_cast<__uint128_t>(v)
++
++namespace js {
++namespace jit {
++
++static int64_t MultiplyHighSigned(int64_t u, int64_t v) {
++  uint64_t u0, v0, w0;
++  int64_t u1, v1, w1, w2, t;
++
++  u0 = u & 0xFFFFFFFFL;
++  u1 = u >> 32;
++  v0 = v & 0xFFFFFFFFL;
++  v1 = v >> 32;
++
++  w0 = u0 * v0;
++  t = u1 * v0 + (w0 >> 32);
++  w1 = t & 0xFFFFFFFFL;
++  w2 = t >> 32;
++  w1 = u0 * v1 + w1;
++
++  return u1 * v1 + w2 + (w1 >> 32);
++}
++
++static uint64_t MultiplyHighUnsigned(uint64_t u, uint64_t v) {
++  uint64_t u0, v0, w0;
++  uint64_t u1, v1, w1, w2, t;
++
++  u0 = u & 0xFFFFFFFFL;
++  u1 = u >> 32;
++  v0 = v & 0xFFFFFFFFL;
++  v1 = v >> 32;
++
++  w0 = u0 * v0;
++  t = u1 * v0 + (w0 >> 32);
++  w1 = t & 0xFFFFFFFFL;
++  w2 = t >> 32;
++  w1 = u0 * v1 + w1;
++
++  return u1 * v1 + w2 + (w1 >> 32);
++}
++
++inline constexpr uint32_t RotateLeft32(uint32_t value, uint32_t shift) {
++  return (value << shift) | (value >> ((32 - shift) & 31));
++}
++
++inline constexpr uint64_t RotateLeft64(uint64_t value, uint64_t shift) {
++  return (value << shift) | (value >> ((64 - shift) & 63));
++}
++
++// Generate a 64-bit mask with bits mb..me set (PPC numbering: 0 = MSB = bit
++// 63 in C).  When mb <= me, a contiguous range is set; when mb > me, the
++// mask wraps around (bits 0..me and mb..63 are set).
++static inline uint64_t MASK64(unsigned mb, unsigned me) {
++  MOZ_ASSERT(mb < 64 && me < 64);
++  uint64_t mask_begin = ~0ULL >> mb;
++  uint64_t mask_end = ~0ULL << (63 - me);
++  if (mb <= me) {
++    return mask_begin & mask_end;
++  }
++  return mask_begin | mask_end;
++}
++
++static inline uint32_t MASK32(unsigned mb, unsigned me) {
++  MOZ_ASSERT(mb < 32 && me < 32);
++  uint32_t mask_begin = ~0U >> mb;
++  uint32_t mask_end = ~0U << (31 - me);
++  if (mb <= me) {
++    return mask_begin & mask_end;
++  }
++  return mask_begin | mask_end;
++}
++
++// Count leading zeros.
++static inline int CountLeadingZeros64(uint64_t value) {
++  if (value == 0) return 64;
++  return __builtin_clzll(value);
++}
++
++static inline int CountLeadingZeros32(uint32_t value) {
++  if (value == 0) return 32;
++  return __builtin_clz(value);
++}
++
++static inline int CountTrailingZeros64(uint64_t value) {
++  if (value == 0) return 64;
++  return __builtin_ctzll(value);
++}
++
++static inline int CountTrailingZeros32(uint32_t value) {
++  if (value == 0) return 32;
++  return __builtin_ctz(value);
++}
++
++static inline int PopCount64(uint64_t value) {
++  return __builtin_popcountll(value);
++}
++
++static inline int PopCount32(uint32_t value) {
++  return __builtin_popcount(value);
++}
++
++static inline uint64_t PopCountPerByte(uint64_t value) {
++  uint64_t result = 0;
++  for (int i = 0; i < 8; i++) {
++    uint8_t byte = (value >> (i * 8)) & 0xFF;
++    result |= (uint64_t)__builtin_popcount(byte) << (i * 8);
++  }
++  return result;
++}
++
++// PPC64 C argument slots: PPC64 ELFv2 ABI does not require C argument
++// slots on the stack for register-passed arguments, but we reserve the
++// link area (32 bytes).
++const int kCArgSlotCount = 0;
++const int kCArgsSlotsSize = kCArgSlotCount * sizeof(uintptr_t);
++
++// -----------------------------------------------------------------------------
++// PPC64 SimInstruction.
++
++class SimInstruction {
++ public:
++  enum {
++    kInstrSize = 4,
++    kPCReadOffset = 0
++  };
++
++  inline Instr instructionBits() const {
++    return *reinterpret_cast<const Instr*>(this);
++  }
++
++  inline void setInstructionBits(Instr value) {
++    *reinterpret_cast<Instr*>(this) = value;
++  }
++
++  inline int bit(int nr) const { return (instructionBits() >> nr) & 1; }
++
++  inline uint32_t bits(int hi, int lo) const {
++    return (instructionBits() >> lo) & ((2U << (hi - lo)) - 1);
++  }
++
++  inline uint32_t opcode() const { return bits(31, 26); }
++
++  inline uint32_t rtValue() const { return bits(25, 21); }
++  inline uint32_t rsValue() const { return bits(25, 21); }
++  inline uint32_t raValue() const { return bits(20, 16); }
++  inline uint32_t rbValue() const { return bits(15, 11); }
++  inline uint32_t rcValue() const { return bits(10, 6); }
++
++  inline uint32_t boValue() const { return bits(25, 21); }
++  inline uint32_t biValue() const { return bits(20, 16); }
++
++  // D-form 16-bit immediate (sign-extend to get signed value).
++  inline int16_t imm16Value() const { return I16(bits(15, 0)); }
++  inline uint16_t uimm16Value() const { return U16(bits(15, 0)); }
++
++  // DS-form 14-bit displacement (bits 2..15, 4-byte aligned).
++  inline int16_t ds14Value() const {
++    return I16(bits(15, 2) << 2);
++  }
++
++  // B-form 14-bit branch displacement (bits 2..15, 4-byte aligned).
++  inline int32_t bd16Value() const {
++    int16_t raw = I16(bits(15, 2) << 2);
++    return (int32_t)raw;
++  }
++
++  // I-form 24-bit branch offset (bits 2..25, sign-extended, 4-byte aligned).
++  inline int32_t li26Value() const {
++    int32_t raw = I32(bits(25, 2) << 2);
++    // Sign-extend from 26 bits.
++    return (raw << 6) >> 6;
++  }
++
++  // Extended opcode for X-form / XO-form (bits 1..10).
++  inline uint32_t xoValue() const { return bits(10, 1); }
++
++  // Extended opcode for XL-form (bits 1..10).
++  inline uint32_t xlValue() const { return bits(10, 1); }
++
++  // MD-form SH field: sh[0:4] in instruction bits 15:11, sh[5] in bit 1.
++  // Assembler encodes: ((sh & 0x1f) << 11) | ((sh & 0x20) >> 4).
++  inline uint32_t mdSHValue() const {
++    return bits(15, 11) | (bit(1) << 5);
++  }
++  // mb/me for MD-form (rldicl/rldicr/rldic/rldimi): 6-bit field split as
++  // mb[0:4] in instruction bits 10:6 and mb[5] in bit 5.
++  inline uint32_t mdMBValue() const {
++    return bits(10, 6) | (bit(5) << 5);
++  }
++  inline uint32_t mdMEValue() const { return mdMBValue(); }
++
++  // MD-form XO (bits 2..4).
++  inline uint32_t mdXOValue() const { return bits(4, 2); }
++
++  // MDS-form (rldcl, rldcr): mb[0:4] in bits 10:6, mb[5] in bit 5.
++  inline uint32_t mdsMBValue() const {
++    return bits(10, 6) | (bit(5) << 5);
++  }
++
++  // M-form fields (32-bit rotate/mask).
++  inline uint32_t mSHValue() const { return bits(15, 11); }
++  inline uint32_t mMBValue() const { return bits(10, 6); }
++  inline uint32_t mMEValue() const { return bits(5, 1); }
++
++  // Rc bit.
++  inline bool rcBit() const { return bit(0); }
++
++  // AA bit for branch instructions.
++  inline bool aaBit() const { return bit(1); }
++
++  // LK bit for branch instructions.
++  inline bool lkBit() const { return bit(0); }
++
++  // OE bit for XO-form arithmetic.
++  inline bool oeBit() const { return bit(10); }
++
++  // L bit for compare instructions (bit 21).
++  inline bool lBit() const { return bit(21); }
++
++  // BF field (bits 23..25) for compares.
++  inline uint32_t bfValue() const { return bits(25, 23); }
++
++  bool isTrap() const {
++    uint32_t instr = instructionBits();
++    // PPC_trap = 0x7FE00008 (tw 31,0,0).
++    // Don't treat the call-redirection instruction or wasm trap as a
++    // debugger trap.
++    if (instr == kCallRedirInstr) return false;
++    if (instr == 0x7FE00008) return false;
++    // Any other tw instruction with TO=31 is a trap.
++    if (opcode() == 31 && (xoValue() == 4)) return true;
++    return false;
++  }
++
++ private:
++  SimInstruction() = delete;
++  SimInstruction(const SimInstruction& other) = delete;
++  void operator=(const SimInstruction& other) = delete;
++};
++
++// -----------------------------------------------------------------------------
++// ICache.
++
++class CachePage {
++ public:
++  static const int LINE_VALID = 0;
++  static const int LINE_INVALID = 1;
++
++  static const int kPageShift = 12;
++  static const int kPageSize = 1 << kPageShift;
++  static const int kPageMask = kPageSize - 1;
++  static const int kLineShift = 2;
++  static const int kLineLength = 1 << kLineShift;
++  static const int kLineMask = kLineLength - 1;
++
++  CachePage() { memset(&validity_map_, LINE_INVALID, sizeof(validity_map_)); }
++
++  char* validityByte(int offset) {
++    return &validity_map_[offset >> kLineShift];
++  }
++
++  char* cachedData(int offset) { return &data_[offset]; }
++
++ private:
++  char data_[kPageSize];
++  static const int kValidityMapSize = kPageSize >> kLineShift;
++  char validity_map_[kValidityMapSize];
++};
++
++class AutoLockSimulatorCache : public LockGuard<Mutex> {
++  using Base = LockGuard<Mutex>;
++
++ public:
++  explicit AutoLockSimulatorCache()
++      : Base(SimulatorProcess::singleton_->cacheLock_) {}
++};
++
++mozilla::Atomic<size_t, mozilla::ReleaseAcquire>
++    SimulatorProcess::ICacheCheckingDisableCount(1);
++SimulatorProcess* SimulatorProcess::singleton_ = nullptr;
++
++int64_t Simulator::StopSimAt = -1;
++
++// -----------------------------------------------------------------------------
++// Simulator Create / Destroy.
++
++Simulator* Simulator::Create() {
++  auto sim = MakeUnique<Simulator>();
++  if (!sim) {
++    return nullptr;
++  }
++
++  if (!sim->init()) {
++    return nullptr;
++  }
++
++  int64_t stopAt;
++  char* stopAtStr = getenv("PPC64_SIM_STOP_AT");
++  if (stopAtStr && sscanf(stopAtStr, "%" PRIi64, &stopAt) == 1) {
++    fprintf(stderr, "\nStopping simulation at icount %" PRIi64 "\n", stopAt);
++    Simulator::StopSimAt = stopAt;
++  }
++
++  return sim.release();
++}
++
++void Simulator::Destroy(Simulator* sim) { js_delete(sim); }
++
++// -----------------------------------------------------------------------------
++// Debugger.
++
++class ppc64Debugger {
++ public:
++  explicit ppc64Debugger(Simulator* sim) : sim_(sim) {}
++
++  void stop(SimInstruction* instr);
++  void debug();
++  void printAllRegs();
++  void printAllRegsIncludingFPU();
++
++ private:
++  static const Instr kBreakpointInstr = 0x7FE00008;  // PPC_trap
++  static const Instr kNopInstr = 0x60000000;          // PPC_nop
++
++  Simulator* sim_;
++
++  int64_t getRegisterValue(int regnum);
++  int64_t getFPURegisterValueLong(int regnum);
++  float getFPURegisterValueFloat(int regnum);
++  double getFPURegisterValueDouble(int regnum);
++  bool getValue(const char* desc, int64_t* value);
++
++  bool setBreakpoint(SimInstruction* breakpc);
++  bool deleteBreakpoint(SimInstruction* breakpc);
++
++  void undoBreakpoints();
++  void redoBreakpoints();
++};
++
++[[maybe_unused]] static void UNIMPLEMENTED() {
++  printf("UNIMPLEMENTED instruction.\n");
++  MOZ_CRASH();
++}
++[[maybe_unused]] static void UNREACHABLE() {
++  printf("UNREACHABLE instruction.\n");
++  MOZ_CRASH();
++}
++[[maybe_unused]] static void UNSUPPORTED() {
++  printf("Unsupported instruction.\n");
++  MOZ_CRASH();
++}
++
++void ppc64Debugger::stop(SimInstruction* instr) {
++  uint32_t code = 0;
++  char* msg = *reinterpret_cast<char**>(sim_->get_pc() +
++                                        SimInstruction::kInstrSize);
++  if (!sim_->watchedStops_[code].desc_) {
++    sim_->watchedStops_[code].desc_ = msg;
++  }
++  if (code != kMaxStopCode) {
++    printf("Simulator hit stop %u: %s\n", code, msg);
++  } else {
++    printf("Simulator hit %s\n", msg);
++  }
++  sim_->set_pc(sim_->get_pc() + 2 * SimInstruction::kInstrSize);
++  debug();
++}
++
++int64_t ppc64Debugger::getRegisterValue(int regnum) {
++  if (regnum == kPCRegister) {
++    return sim_->get_pc();
++  }
++  return sim_->getRegister(regnum);
++}
++
++int64_t ppc64Debugger::getFPURegisterValueLong(int regnum) {
++  return sim_->getFpuRegister(regnum);
++}
++
++float ppc64Debugger::getFPURegisterValueFloat(int regnum) {
++  return sim_->getFpuRegisterFloat(regnum);
++}
++
++double ppc64Debugger::getFPURegisterValueDouble(int regnum) {
++  return sim_->getFpuRegisterDouble(regnum);
++}
++
++bool ppc64Debugger::getValue(const char* desc, int64_t* value) {
++  Register reg = Register::FromName(desc);
++  if (reg != InvalidReg) {
++    *value = getRegisterValue(reg.code());
++    return true;
++  }
++
++  if (strncmp(desc, "0x", 2) == 0) {
++    return sscanf(desc + 2, "%" PRIx64, reinterpret_cast<uint64_t*>(value)) ==
++           1;
++  }
++  return sscanf(desc, "%" PRIu64, reinterpret_cast<uint64_t*>(value)) == 1;
++}
++
++bool ppc64Debugger::setBreakpoint(SimInstruction* breakpc) {
++  if (sim_->break_pc_ != nullptr) {
++    return false;
++  }
++
++  sim_->break_pc_ = breakpc;
++  sim_->break_instr_ = breakpc->instructionBits();
++  return true;
++}
++
++bool ppc64Debugger::deleteBreakpoint(SimInstruction* breakpc) {
++  if (sim_->break_pc_ != nullptr) {
++    sim_->break_pc_->setInstructionBits(sim_->break_instr_);
++  }
++
++  sim_->break_pc_ = nullptr;
++  sim_->break_instr_ = 0;
++  return true;
++}
++
++void ppc64Debugger::undoBreakpoints() {
++  if (sim_->break_pc_) {
++    sim_->break_pc_->setInstructionBits(sim_->break_instr_);
++  }
++}
++
++void ppc64Debugger::redoBreakpoints() {
++  if (sim_->break_pc_) {
++    sim_->break_pc_->setInstructionBits(kBreakpointInstr);
++  }
++}
++
++void ppc64Debugger::printAllRegs() {
++  int64_t value;
++  for (uint32_t i = 0; i < Registers::Total; i++) {
++    value = getRegisterValue(i);
++    printf("%3s: 0x%016" PRIx64 " %20" PRIi64 "   ", Registers::GetName(i),
++           value, value);
++
++    if (i % 2) {
++      printf("\n");
++    }
++  }
++  printf("\n");
++
++  value = getRegisterValue(Simulator::pc);
++  printf("  pc: 0x%016" PRIx64 "\n", value);
++  printf("  lr: 0x%016" PRIx64 "\n", sim_->getLR());
++  printf(" ctr: 0x%016" PRIx64 "\n", sim_->getCTR());
++  printf("  cr: 0x%08x\n", sim_->getCR());
++  printf(" xer: 0x%016" PRIx64 "\n", sim_->getXER());
++}
++
++void ppc64Debugger::printAllRegsIncludingFPU() {
++  printAllRegs();
++
++  printf("\n\n");
++  for (uint32_t i = 0; i < FloatRegisters::TotalPhys; i++) {
++    printf("%3s: 0x%016" PRIx64 "\tflt: %-8.4g\tdbl: %-16.4g\n",
++           FloatRegisters::GetName(i), getFPURegisterValueLong(i),
++           getFPURegisterValueFloat(i), getFPURegisterValueDouble(i));
++  }
++}
++
++static char* ReadLine(const char* prompt) {
++  UniqueChars result;
++  char lineBuf[256];
++  int offset = 0;
++  bool keepGoing = true;
++  fprintf(stdout, "%s", prompt);
++  fflush(stdout);
++  while (keepGoing) {
++    if (fgets(lineBuf, sizeof(lineBuf), stdin) == nullptr) {
++      return nullptr;
++    }
++    int len = strlen(lineBuf);
++    if (len > 0 && lineBuf[len - 1] == '\n') {
++      keepGoing = false;
++    }
++    if (!result) {
++      result.reset(js_pod_malloc<char>(len + 1));
++      if (!result) {
++        return nullptr;
++      }
++    } else {
++      int new_len = offset + len + 1;
++      char* new_result = js_pod_malloc<char>(new_len);
++      if (!new_result) {
++        return nullptr;
++      }
++      memcpy(new_result, result.get(), offset * sizeof(char));
++      result.reset(new_result);
++    }
++    memcpy(result.get() + offset, lineBuf, len * sizeof(char));
++    offset += len;
++  }
++
++  MOZ_ASSERT(result);
++  result[offset] = '\0';
++  return result.release();
++}
++
++static void DisassembleInstruction(uint64_t pc) {
++  printf("  0x%016" PRIx64 ":  %08x\n", pc,
++         *reinterpret_cast<uint32_t*>(pc));
++}
++
++void ppc64Debugger::debug() {
++  intptr_t lastPC = -1;
++  bool done = false;
++
++#define COMMAND_SIZE 63
++#define ARG_SIZE 255
++
++#define STR(a) #a
++#define XSTR(a) STR(a)
++
++  char cmd[COMMAND_SIZE + 1];
++  char arg1[ARG_SIZE + 1];
++  char arg2[ARG_SIZE + 1];
++  char* argv[3] = {cmd, arg1, arg2};
++
++  cmd[COMMAND_SIZE] = 0;
++  arg1[ARG_SIZE] = 0;
++  arg2[ARG_SIZE] = 0;
++
++  undoBreakpoints();
++
++  while (!done && (sim_->get_pc() != Simulator::end_sim_pc)) {
++    if (lastPC != sim_->get_pc()) {
++      DisassembleInstruction(sim_->get_pc());
++      lastPC = sim_->get_pc();
++    }
++    char* line = ReadLine("sim> ");
++    if (line == nullptr) {
++      break;
++    } else {
++      char* last_input = sim_->lastDebuggerInput();
++      if (strcmp(line, "\n") == 0 && last_input != nullptr) {
++        line = last_input;
++      } else {
++        sim_->setLastDebuggerInput(line);
++      }
++      int argc = sscanf(line,
++                              "%" XSTR(COMMAND_SIZE) "s "
++                              "%" XSTR(ARG_SIZE) "s "
++                              "%" XSTR(ARG_SIZE) "s",
++                              cmd, arg1, arg2);
++      if ((strcmp(cmd, "si") == 0) || (strcmp(cmd, "stepi") == 0)) {
++        SimInstruction* instr =
++            reinterpret_cast<SimInstruction*>(sim_->get_pc());
++        if (!instr->isTrap()) {
++          sim_->instructionDecode(instr);
++        } else {
++          printf("/!\\ Jumping over generated breakpoint.\n");
++          sim_->set_pc(sim_->get_pc() + SimInstruction::kInstrSize);
++        }
++        sim_->icount_++;
++      } else if ((strcmp(cmd, "c") == 0) || (strcmp(cmd, "cont") == 0)) {
++        sim_->instructionDecode(
++            reinterpret_cast<SimInstruction*>(sim_->get_pc()));
++        sim_->icount_++;
++        done = true;
++      } else if ((strcmp(cmd, "p") == 0) || (strcmp(cmd, "print") == 0)) {
++        if (argc == 2) {
++          int64_t value;
++          if (strcmp(arg1, "all") == 0) {
++            printAllRegs();
++          } else if (strcmp(arg1, "allf") == 0) {
++            printAllRegsIncludingFPU();
++          } else {
++            Register reg = Register::FromName(arg1);
++            FloatRegisters::Code fReg = FloatRegisters::FromName(arg1);
++            if (reg != InvalidReg) {
++              value = getRegisterValue(reg.code());
++              printf("%s: 0x%016" PRIx64 " %20" PRIi64 " \n", arg1, value,
++                     value);
++            } else if (fReg != FloatRegisters::Invalid) {
++              printf("%3s: 0x%016" PRIx64 "\tflt: %-8.4g\tdbl: %-16.4g\n",
++                     FloatRegisters::GetName(fReg),
++                     getFPURegisterValueLong(fReg),
++                     getFPURegisterValueFloat(fReg),
++                     getFPURegisterValueDouble(fReg));
++            } else {
++              printf("%s unrecognized\n", arg1);
++            }
++          }
++        } else {
++          printf("print <register> or print <fpu register> single\n");
++        }
++      } else if (strcmp(cmd, "stack") == 0 || strcmp(cmd, "mem") == 0) {
++        int64_t* cur = nullptr;
++        int64_t* end = nullptr;
++        int next_arg = 1;
++
++        if (strcmp(cmd, "stack") == 0) {
++          cur = reinterpret_cast<int64_t*>(sim_->getRegister(Simulator::sp));
++        } else {
++          int64_t value;
++          if (!getValue(arg1, &value)) {
++            printf("%s unrecognized\n", arg1);
++            continue;
++          }
++          cur = reinterpret_cast<int64_t*>(value);
++          next_arg++;
++        }
++
++        int64_t words;
++        if (argc == next_arg) {
++          words = 10;
++        } else {
++          if (!getValue(argv[next_arg], &words)) {
++            words = 10;
++          }
++        }
++        end = cur + words;
++
++        while (cur < end) {
++          printf("  %p:  0x%016" PRIx64 " %20" PRIi64, cur, *cur, *cur);
++          printf("\n");
++          cur++;
++        }
++
++      } else if ((strcmp(cmd, "disasm") == 0) || (strcmp(cmd, "dpc") == 0) ||
++                 (strcmp(cmd, "di") == 0)) {
++        uint8_t* cur = nullptr;
++        uint8_t* end = nullptr;
++
++        if (argc == 1) {
++          cur = reinterpret_cast<uint8_t*>(sim_->get_pc());
++          end = cur + (10 * SimInstruction::kInstrSize);
++        } else if (argc == 2) {
++          Register reg = Register::FromName(arg1);
++          if (reg != InvalidReg || strncmp(arg1, "0x", 2) == 0) {
++            int64_t value;
++            if (getValue(arg1, &value)) {
++              cur = reinterpret_cast<uint8_t*>(value);
++              end = cur + (10 * SimInstruction::kInstrSize);
++            }
++          } else {
++            int64_t value;
++            if (getValue(arg1, &value)) {
++              cur = reinterpret_cast<uint8_t*>(sim_->get_pc());
++              end = cur + (value * SimInstruction::kInstrSize);
++            }
++          }
++        } else {
++          int64_t value1;
++          int64_t value2;
++          if (getValue(arg1, &value1) && getValue(arg2, &value2)) {
++            cur = reinterpret_cast<uint8_t*>(value1);
++            end = cur + (value2 * SimInstruction::kInstrSize);
++          }
++        }
++
++        while (cur < end) {
++          DisassembleInstruction(uint64_t(cur));
++          cur += SimInstruction::kInstrSize;
++        }
++      } else if (strcmp(cmd, "gdb") == 0) {
++        printf("relinquishing control to gdb\n");
++#if defined(__x86_64__)
++        asm("int $3");
++#elif defined(__aarch64__)
++        asm("brk #0xf000");
++#endif
++        printf("regaining control from gdb\n");
++      } else if (strcmp(cmd, "break") == 0) {
++        if (argc == 2) {
++          int64_t value;
++          if (getValue(arg1, &value)) {
++            if (!setBreakpoint(reinterpret_cast<SimInstruction*>(value))) {
++              printf("setting breakpoint failed\n");
++            }
++          } else {
++            printf("%s unrecognized\n", arg1);
++          }
++        } else {
++          printf("break <address>\n");
++        }
++      } else if (strcmp(cmd, "del") == 0) {
++        if (!deleteBreakpoint(nullptr)) {
++          printf("deleting breakpoint failed\n");
++        }
++      } else if (strcmp(cmd, "flags") == 0) {
++        printf("CR: 0x%08x   XER: 0x%016" PRIx64 "\n", sim_->getCR(),
++               sim_->getXER());
++      } else if (strcmp(cmd, "stop") == 0) {
++        int64_t value;
++        intptr_t stop_pc = sim_->get_pc() - 2 * SimInstruction::kInstrSize;
++        SimInstruction* stop_instr =
++            reinterpret_cast<SimInstruction*>(stop_pc);
++        SimInstruction* msg_address = reinterpret_cast<SimInstruction*>(
++            stop_pc + SimInstruction::kInstrSize);
++        if ((argc == 2) && (strcmp(arg1, "unstop") == 0)) {
++          if (sim_->isStopInstruction(stop_instr)) {
++            stop_instr->setInstructionBits(kNopInstr);
++            msg_address->setInstructionBits(kNopInstr);
++          } else {
++            printf("Not at debugger stop.\n");
++          }
++        } else if (argc == 3) {
++          if (strcmp(arg1, "info") == 0) {
++            if (strcmp(arg2, "all") == 0) {
++              printf("Stop information:\n");
++              for (uint32_t i = kMaxWatchpointCode + 1; i <= kMaxStopCode;
++                   i++) {
++                sim_->printStopInfo(i);
++              }
++            } else if (getValue(arg2, &value)) {
++              sim_->printStopInfo(value);
++            } else {
++              printf("Unrecognized argument.\n");
++            }
++          } else if (strcmp(arg1, "enable") == 0) {
++            if (strcmp(arg2, "all") == 0) {
++              for (uint32_t i = kMaxWatchpointCode + 1; i <= kMaxStopCode;
++                   i++) {
++                sim_->enableStop(i);
++              }
++            } else if (getValue(arg2, &value)) {
++              sim_->enableStop(value);
++            } else {
++              printf("Unrecognized argument.\n");
++            }
++          } else if (strcmp(arg1, "disable") == 0) {
++            if (strcmp(arg2, "all") == 0) {
++              for (uint32_t i = kMaxWatchpointCode + 1; i <= kMaxStopCode;
++                   i++) {
++                sim_->disableStop(i);
++              }
++            } else if (getValue(arg2, &value)) {
++              sim_->disableStop(value);
++            } else {
++              printf("Unrecognized argument.\n");
++            }
++          }
++        } else {
++          printf("Wrong usage. Use help command for more information.\n");
++        }
++      } else if ((strcmp(cmd, "h") == 0) || (strcmp(cmd, "help") == 0)) {
++        printf("cont\n");
++        printf("  continue execution (alias 'c')\n");
++        printf("stepi\n");
++        printf("  step one instruction (alias 'si')\n");
++        printf("print <register>\n");
++        printf("  print register content (alias 'p')\n");
++        printf("  use register name 'all' to print all registers\n");
++        printf("stack [<words>]\n");
++        printf("  dump stack content, default dump 10 words)\n");
++        printf("mem <address> [<words>]\n");
++        printf("  dump memory content, default dump 10 words)\n");
++        printf("flags\n");
++        printf("  print CR and XER\n");
++        printf("disasm [<instructions>]\n");
++        printf("disasm [<address/register>]\n");
++        printf("disasm [[<address/register>] <instructions>]\n");
++        printf("  disassemble code, default is 10 instructions\n");
++        printf("  from pc (alias 'di')\n");
++        printf("gdb\n");
++        printf("  enter gdb\n");
++        printf("break <address>\n");
++        printf("  set a break point on the address\n");
++        printf("del\n");
++        printf("  delete the breakpoint\n");
++      } else {
++        printf("Unknown command: %s\n", cmd);
++      }
++    }
++  }
++
++  redoBreakpoints();
++
++#undef COMMAND_SIZE
++#undef ARG_SIZE
++
++#undef STR
++#undef XSTR
++}
++
++// -----------------------------------------------------------------------------
++// ICache helpers.
++
++static bool AllOnOnePage(uintptr_t start, int size) {
++  intptr_t start_page = (start & ~CachePage::kPageMask);
++  intptr_t end_page = ((start + size) & ~CachePage::kPageMask);
++  return start_page == end_page;
++}
++
++void Simulator::setLastDebuggerInput(char* input) {
++  js_free(lastDebuggerInput_);
++  lastDebuggerInput_ = input;
++}
++
++static CachePage* GetCachePageLocked(SimulatorProcess::ICacheMap& i_cache,
++                                     void* page) {
++  SimulatorProcess::ICacheMap::AddPtr p = i_cache.lookupForAdd(page);
++  if (p) {
++    return p->value();
++  }
++  AutoEnterOOMUnsafeRegion oomUnsafe;
++  CachePage* new_page = js_new<CachePage>();
++  if (!new_page || !i_cache.add(p, page, new_page)) {
++    oomUnsafe.crash("Simulator CachePage");
++  }
++  return new_page;
++}
++
++static void FlushOnePageLocked(SimulatorProcess::ICacheMap& i_cache,
++                               intptr_t start, int size) {
++  MOZ_ASSERT(size <= CachePage::kPageSize);
++  MOZ_ASSERT(AllOnOnePage(start, size - 1));
++  MOZ_ASSERT((start & CachePage::kLineMask) == 0);
++  MOZ_ASSERT((size & CachePage::kLineMask) == 0);
++  void* page = reinterpret_cast<void*>(start & (~CachePage::kPageMask));
++  int offset = (start & CachePage::kPageMask);
++  CachePage* cache_page = GetCachePageLocked(i_cache, page);
++  char* valid_bytemap = cache_page->validityByte(offset);
++  memset(valid_bytemap, CachePage::LINE_INVALID, size >> CachePage::kLineShift);
++}
++
++static void FlushICacheLocked(SimulatorProcess::ICacheMap& i_cache,
++                              void* start_addr, size_t size) {
++  intptr_t start = reinterpret_cast<intptr_t>(start_addr);
++  int intra_line = (start & CachePage::kLineMask);
++  start -= intra_line;
++  size += intra_line;
++  size = ((size - 1) | CachePage::kLineMask) + 1;
++  int offset = (start & CachePage::kPageMask);
++  while (!AllOnOnePage(start, size - 1)) {
++    int bytes_to_flush = CachePage::kPageSize - offset;
++    FlushOnePageLocked(i_cache, start, bytes_to_flush);
++    start += bytes_to_flush;
++    size -= bytes_to_flush;
++    MOZ_ASSERT((start & CachePage::kPageMask) == 0);
++    offset = 0;
++  }
++  if (size != 0) {
++    FlushOnePageLocked(i_cache, start, size);
++  }
++}
++
++/* static */
++void SimulatorProcess::checkICacheLocked(SimInstruction* instr) {
++  intptr_t address = reinterpret_cast<intptr_t>(instr);
++  void* page = reinterpret_cast<void*>(address & (~CachePage::kPageMask));
++  void* line = reinterpret_cast<void*>(address & (~CachePage::kLineMask));
++  int offset = (address & CachePage::kPageMask);
++  CachePage* cache_page = GetCachePageLocked(icache(), page);
++  char* cache_valid_byte = cache_page->validityByte(offset);
++  bool cache_hit = (*cache_valid_byte == CachePage::LINE_VALID);
++  char* cached_line = cache_page->cachedData(offset & ~CachePage::kLineMask);
++
++  if (cache_hit) {
++    mozilla::DebugOnly<int> cmpret =
++        memcmp(reinterpret_cast<void*>(instr), cache_page->cachedData(offset),
++               SimInstruction::kInstrSize);
++    MOZ_ASSERT(cmpret == 0);
++  } else {
++    memcpy(cached_line, line, CachePage::kLineLength);
++    *cache_valid_byte = CachePage::LINE_VALID;
++  }
++}
++
++HashNumber SimulatorProcess::ICacheHasher::hash(const Lookup& l) {
++  return U32(reinterpret_cast<uintptr_t>(l)) >> 2;
++}
++
++bool SimulatorProcess::ICacheHasher::match(const Key& k, const Lookup& l) {
++  MOZ_ASSERT((reinterpret_cast<intptr_t>(k) & CachePage::kPageMask) == 0);
++  MOZ_ASSERT((reinterpret_cast<intptr_t>(l) & CachePage::kPageMask) == 0);
++  return k == l;
++}
++
++/* static */
++void SimulatorProcess::FlushICache(void* start_addr, size_t size) {
++  if (!ICacheCheckingDisableCount) {
++    AutoLockSimulatorCache als;
++    js::jit::FlushICacheLocked(icache(), start_addr, size);
++  }
++}
++
++// -----------------------------------------------------------------------------
++// Redirection.
++
++class Redirection {
++  friend class SimulatorProcess;
++
++  Redirection(void* nativeFunction, ABIFunctionType type)
++      : nativeFunction_(nativeFunction),
++        swiInstruction_(kCallRedirInstr),
++        type_(type),
++        next_(nullptr) {
++    next_ = SimulatorProcess::redirection();
++    if (!SimulatorProcess::ICacheCheckingDisableCount) {
++      FlushICacheLocked(SimulatorProcess::icache(), addressOfSwiInstruction(),
++                        SimInstruction::kInstrSize);
++    }
++    SimulatorProcess::setRedirection(this);
++  }
++
++ public:
++  void* addressOfSwiInstruction() { return &swiInstruction_; }
++  void* nativeFunction() const { return nativeFunction_; }
++  ABIFunctionType type() const { return type_; }
++
++  static Redirection* Get(void* nativeFunction, ABIFunctionType type) {
++    AutoLockSimulatorCache als;
++
++    Redirection* current = SimulatorProcess::redirection();
++    for (; current != nullptr; current = current->next_) {
++      if (current->nativeFunction_ == nativeFunction) {
++        MOZ_ASSERT(current->type() == type);
++        return current;
++      }
++    }
++
++    AutoEnterOOMUnsafeRegion oomUnsafe;
++    Redirection* redir = js_pod_malloc<Redirection>(1);
++    if (!redir) {
++      oomUnsafe.crash("Simulator redirection");
++    }
++    new (redir) Redirection(nativeFunction, type);
++    return redir;
++  }
++
++  static Redirection* FromSwiInstruction(SimInstruction* swiInstruction) {
++    uint8_t* addrOfSwi = reinterpret_cast<uint8_t*>(swiInstruction);
++    uint8_t* addrOfRedirection =
++        addrOfSwi - offsetof(Redirection, swiInstruction_);
++    return reinterpret_cast<Redirection*>(addrOfRedirection);
++  }
++
++ private:
++  void* nativeFunction_;
++  uint32_t swiInstruction_;
++  ABIFunctionType type_;
++  Redirection* next_;
++};
++
++// -----------------------------------------------------------------------------
++// Simulator constructor / destructor / init.
++
++Simulator::Simulator() {
++  stack_ = nullptr;
++  stackLimit_ = 0;
++  pc_modified_ = false;
++  icount_ = 0;
++  break_count_ = 0;
++  break_pc_ = nullptr;
++  break_instr_ = 0;
++  single_stepping_ = false;
++  single_step_callback_ = nullptr;
++  single_step_callback_arg_ = nullptr;
++
++  for (int i = 0; i < Register::kNumSimuRegisters; i++) {
++    registers_[i] = 0;
++  }
++  for (int i = 0; i < Simulator::FPURegister::kNumFPURegisters; i++) {
++    FPUregisters_[i] = 0;
++  }
++
++  LR_ = 0;
++  CTR_ = 0;
++  CR_ = 0;
++  XER_ = 0;
++  FPSCR_ = 0;
++  LLBit_ = false;
++  LLAddr_ = 0;
++  lastLLValue_ = 0;
++
++  // Initialize PC and LR to a known bad value that will cause an
++  // access violation if the simulator ever tries to execute it.
++  registers_[pc] = bad_ra;
++  LR_ = bad_ra;
++
++  lastDebuggerInput_ = nullptr;
++}
++
++bool Simulator::init() {
++  static const size_t stackSize = 2 * 1024 * 1024;
++  stack_ = js_pod_malloc<char>(stackSize);
++  if (!stack_) {
++    return false;
++  }
++
++  // Leave a safety margin of 1MB to prevent overrunning the stack.
++  stackLimit_ = reinterpret_cast<uintptr_t>(stack_) + 1024 * 1024;
++
++  // The sp is initialized to point to the bottom (high address) of the
++  // allocated stack area.
++  registers_[sp] = reinterpret_cast<int64_t>(stack_) + stackSize - 64;
++
++  // Zero-initialize VR namespace. Simulated PPC64 does not guarantee any
++  // value in VRs at entry, but zeroing avoids uninitialized-read false
++  // positives in tools and makes regression traces deterministic.
++  memset(VRregisters_, 0, sizeof(VRregisters_));
++
++  return true;
++}
++
++Simulator::~Simulator() { js_free(stack_); }
++
++SimulatorProcess::SimulatorProcess()
++    : cacheLock_(mutexid::SimulatorCacheLock), redirection_(nullptr) {
++  if (getenv("PPC64_SIM_ICACHE_CHECKS")) {
++    ICacheCheckingDisableCount = 0;
++  }
++}
++
++SimulatorProcess::~SimulatorProcess() {
++  Redirection* r = redirection_;
++  while (r) {
++    Redirection* next = r->next_;
++    js_delete(r);
++    r = next;
++  }
++}
++
++/* static */
++void* Simulator::RedirectNativeFunction(void* nativeFunction,
++                                        ABIFunctionType type) {
++  Redirection* redirection = Redirection::Get(nativeFunction, type);
++  return redirection->addressOfSwiInstruction();
++}
++
++Simulator* Simulator::Current() {
++  JSContext* cx = TlsContext.get();
++  MOZ_ASSERT(CurrentThreadCanAccessRuntime(cx->runtime()));
++  return cx->simulator();
++}
++
++// -----------------------------------------------------------------------------
++// Register accessors.
++
++void Simulator::setRegister(int reg, int64_t value) {
++  MOZ_ASSERT((reg >= 0) && (reg < Register::kNumSimuRegisters));
++  if (reg == pc) {
++    pc_modified_ = true;
++  }
++  registers_[reg] = value;
++}
++
++int64_t Simulator::getRegister(int reg) const {
++  MOZ_ASSERT((reg >= 0) && (reg < Register::kNumSimuRegisters));
++  return registers_[reg] + ((reg == pc) ? SimInstruction::kPCReadOffset : 0);
++}
++
++void Simulator::setFpuRegister(int fpureg, int64_t value) {
++  MOZ_ASSERT((fpureg >= 0) &&
++             (fpureg < Simulator::FPURegister::kNumFPURegisters));
++  FPUregisters_[fpureg] = value;
++}
++
++void Simulator::setFpuRegisterWord(int fpureg, int32_t value) {
++  MOZ_ASSERT((fpureg >= 0) &&
++             (fpureg < Simulator::FPURegister::kNumFPURegisters));
++  int32_t* pword;
++  pword = reinterpret_cast<int32_t*>(&FPUregisters_[fpureg]);
++  *pword = value;
++}
++
++// Promote f32 → f64 preserving NaN payload, like PPC64's `lfs` and
++// `xscvspdpn`. The plain C cast `(double)f32_nan` is permitted by the
++// standard to quiet a signaling NaN, which on x86/ARM hosts visibly
++// transforms 0x7FA00000 (sNaN) into a qNaN such as 0x7FE00000 — breaking
++// every wasm test that loads a constant sNaN bit pattern. Manually
++// reconstruct the f64 NaN with the same sign + payload (payload shifted
++// left by 29 to fill the wider mantissa).
++static double promoteFloatPreservingNaN(float f) {
++  uint32_t bits;
++  memcpy(&bits, &f, sizeof(bits));
++  if ((bits & 0x7F800000u) == 0x7F800000u && (bits & 0x007FFFFFu) != 0u) {
++    uint64_t sign = uint64_t(bits >> 31) & 1u;
++    uint64_t payload = uint64_t(bits & 0x007FFFFFu);
++    uint64_t dbits = (sign << 63) | (uint64_t(0x7FFu) << 52) | (payload << 29);
++    double d;
++    memcpy(&d, &dbits, sizeof(d));
++    return d;
++  }
++  return (double)f;
++}
++
++// Demote f64 → f32 preserving NaN payload (non-signaling: matches PPC64
++// `stfs` / `xscvdpspn`, and wasm `lfs`-equivalent stores). Truncates the
++// lower 29 bits of the f64 payload (those bits cannot be represented in
++// the narrower f32 mantissa); if the truncation would yield a payload of
++// zero (which would degrade the NaN to an Infinity), force the LSB so
++// the result is still a NaN. This intentionally does NOT set the quiet
++// bit — that's the job of the explicit-quieting op `xscvdpsp` and
++// f32.demote_f64's wasm-level lowering.
++static float demoteDoublePreservingNaN(double d) {
++  uint64_t bits;
++  memcpy(&bits, &d, sizeof(bits));
++  if ((bits & 0x7FF0000000000000ULL) == 0x7FF0000000000000ULL &&
++      (bits & 0x000FFFFFFFFFFFFFULL) != 0) {
++    uint32_t sign = uint32_t(bits >> 63) & 1u;
++    uint32_t payload = uint32_t((bits >> 29) & 0x007FFFFFu);
++    if (payload == 0) payload = 1;
++    uint32_t fbits = (sign << 31) | 0x7F800000u | payload;
++    float f;
++    memcpy(&f, &fbits, sizeof(f));
++    return f;
++  }
++  return (float)d;
++}
++
++void Simulator::setFpuRegisterFloat(int fpureg, float value) {
++  MOZ_ASSERT((fpureg >= 0) &&
++             (fpureg < Simulator::FPURegister::kNumFPURegisters));
++  // ELFv2 ABI: single-precision values in FPRs are stored as their
++  // double-precision representation. Promote and store the full 8 bytes,
++  // not just the low 4. (Otherwise the upper 4 bytes are stale, matching
++  // the layout that fctid/fcfid/lfd would read but NOT what the JIT and
++  // the C ABI expect for a 'float' parameter.) Use the NaN-preserving
++  // helper so a signaling-NaN return value isn't quieted into a qNaN.
++  double promoted = promoteFloatPreservingNaN(value);
++  memcpy(&FPUregisters_[fpureg], &promoted, sizeof(promoted));
++}
++
++void Simulator::setFpuRegisterDouble(int fpureg, double value) {
++  MOZ_ASSERT((fpureg >= 0) &&
++             (fpureg < Simulator::FPURegister::kNumFPURegisters));
++  *mozilla::BitwiseCast<double*>(&FPUregisters_[fpureg]) = value;
++}
++
++int64_t Simulator::getFpuRegister(int fpureg) const {
++  MOZ_ASSERT((fpureg >= 0) &&
++             (fpureg < Simulator::FPURegister::kNumFPURegisters));
++  return FPUregisters_[fpureg];
++}
++
++int32_t Simulator::getFpuRegisterWord(int fpureg) const {
++  MOZ_ASSERT((fpureg >= 0) &&
++             (fpureg < Simulator::FPURegister::kNumFPURegisters));
++  return *mozilla::BitwiseCast<int32_t*>(&FPUregisters_[fpureg]);
++}
++
++int32_t Simulator::getFpuRegisterSignedWord(int fpureg) const {
++  MOZ_ASSERT((fpureg >= 0) &&
++             (fpureg < Simulator::FPURegister::kNumFPURegisters));
++  return *mozilla::BitwiseCast<int32_t*>(&FPUregisters_[fpureg]);
++}
++
++float Simulator::getFpuRegisterFloat(int fpureg) const {
++  MOZ_ASSERT((fpureg >= 0) &&
++             (fpureg < Simulator::FPURegister::kNumFPURegisters));
++  // ELFv2 ABI: single-precision values are passed/returned in FPRs as their
++  // double-precision representation. Read the full 8 bytes as double, then
++  // narrow to float — matching the `frsp` the C callee would do, and matching
++  // what real PPC64 hardware sees when the FPR was loaded via `lfs`. Use the
++  // NaN-preserving helper so a signaling-NaN parameter isn't quieted.
++  double promoted;
++  memcpy(&promoted, &FPUregisters_[fpureg], sizeof(promoted));
++  return demoteDoublePreservingNaN(promoted);
++}
++
++double Simulator::getFpuRegisterDouble(int fpureg) const {
++  MOZ_ASSERT((fpureg >= 0) &&
++             (fpureg < Simulator::FPURegister::kNumFPURegisters));
++  return *mozilla::BitwiseCast<double*>(&FPUregisters_[fpureg]);
++}
++
++void Simulator::setVRBytes(int vreg, const uint8_t bytes[16]) {
++  MOZ_ASSERT((vreg >= 0) && (vreg < kNumVRRegisters));
++  memcpy(VRregisters_[vreg], bytes, 16);
++}
++
++void Simulator::getVRBytes(int vreg, uint8_t bytes[16]) const {
++  MOZ_ASSERT((vreg >= 0) && (vreg < kNumVRRegisters));
++  memcpy(bytes, VRregisters_[vreg], 16);
++}
++
++void Simulator::getVSR128(int vsr, uint8_t bytes[16]) const {
++  MOZ_ASSERT((vsr >= 0) && (vsr < kNumFPURegisters + kNumVRRegisters));
++  if (vsr < kNumFPURegisters) {
++    // VSR 0-31: FPR view. The FPR scalar lives in BE DW0 of the VSR,
++    // which on PPC64LE register storage maps to LE bytes 8-15.
++    // DW1 is undefined per ISA; we model it as zero.
++    // `lfd f0,(mem); xxlor <vr>,f0,f0; stxvx <vr>,...` writes the
++    // double's 8 bytes to the HIGH half of the 16-byte store (LE
++    // bytes 8-15).
++    int64_t val = FPUregisters_[vsr];
++    memset(bytes, 0, 8);
++    memcpy(bytes + 8, &val, 8);
++  } else {
++    memcpy(bytes, VRregisters_[vsr - kNumFPURegisters], 16);
++  }
++}
++
++void Simulator::setVSR128(int vsr, const uint8_t bytes[16]) {
++  MOZ_ASSERT((vsr >= 0) && (vsr < kNumFPURegisters + kNumVRRegisters));
++  if (vsr < kNumFPURegisters) {
++    // FPR scalar at BE DW0 = LE bytes 8-15. DW1 is architecturally
++    // discarded on VSR-to-FPR writes.
++    int64_t val;
++    memcpy(&val, bytes + 8, 8);
++    FPUregisters_[vsr] = val;
++  } else {
++    memcpy(VRregisters_[vsr - kNumFPURegisters], bytes, 16);
++  }
++}
++
++void Simulator::setCallResultDouble(double result) {
++  setFpuRegisterDouble(Simulator::f1, result);
++}
++
++void Simulator::setCallResultFloat(float result) {
++  setFpuRegisterFloat(Simulator::f1, result);
++}
++
++void Simulator::setCallResult(int64_t res) { setRegister(r3, res); }
++
++#ifdef XP_DARWIN
++void Simulator::setCallResult(intptr_t res) {
++  setRegister(r3, I64(res));
++}
++#endif
++
++void Simulator::setCallResult(__int128 res) {
++  setRegister(r3, I64(res));
++  setRegister(r4, I64(res >> 64));
++}
++
++void Simulator::set_pc(int64_t value) {
++  pc_modified_ = true;
++  registers_[pc] = value;
++}
++
++bool Simulator::has_bad_pc() const {
++  return ((registers_[pc] == bad_ra) || (registers_[pc] == end_sim_pc));
++}
++
++int64_t Simulator::get_pc() const { return registers_[pc]; }
++
++JS::ProfilingFrameIterator::RegisterState Simulator::registerState() {
++  wasm::RegisterState state;
++  state.pc = (void*)get_pc();
++  state.fp = (void*)getRegister(fp);
++  state.sp = (void*)getRegister(sp);
++  state.lr = (void*)getLR();
++  return state;
++}
++
++// -----------------------------------------------------------------------------
++// Memory access helpers.
++
++uint8_t Simulator::readBU(uint64_t addr) {
++  if (handleWasmSegFault(addr, 1)) {
++    return 0xff;
++  }
++  uint8_t* ptr = reinterpret_cast<uint8_t*>(addr);
++  return *ptr;
++}
++
++int8_t Simulator::readB(uint64_t addr) {
++  if (handleWasmSegFault(addr, 1)) {
++    return -1;
++  }
++  int8_t* ptr = reinterpret_cast<int8_t*>(addr);
++  return *ptr;
++}
++
++void Simulator::writeB(uint64_t addr, uint8_t value) {
++  if (handleWasmSegFault(addr, 1)) {
++    return;
++  }
++  uint8_t* ptr = reinterpret_cast<uint8_t*>(addr);
++  *ptr = value;
++}
++
++void Simulator::writeB(uint64_t addr, int8_t value) {
++  if (handleWasmSegFault(addr, 1)) {
++    return;
++  }
++  int8_t* ptr = reinterpret_cast<int8_t*>(addr);
++  *ptr = value;
++}
++
++uint16_t Simulator::readHU(uint64_t addr, SimInstruction* instr) {
++  if (handleWasmSegFault(addr, 2)) {
++    return 0xffff;
++  }
++  uint16_t* ptr = reinterpret_cast<uint16_t*>(addr);
++  return *ptr;
++}
++
++int16_t Simulator::readH(uint64_t addr, SimInstruction* instr) {
++  if (handleWasmSegFault(addr, 2)) {
++    return -1;
++  }
++  int16_t* ptr = reinterpret_cast<int16_t*>(addr);
++  return *ptr;
++}
++
++void Simulator::writeH(uint64_t addr, uint16_t value, SimInstruction* instr) {
++  if (handleWasmSegFault(addr, 2)) {
++    return;
++  }
++  uint16_t* ptr = reinterpret_cast<uint16_t*>(addr);
++  LLBit_ = false;
++  *ptr = value;
++}
++
++void Simulator::writeH(uint64_t addr, int16_t value, SimInstruction* instr) {
++  if (handleWasmSegFault(addr, 2)) {
++    return;
++  }
++  int16_t* ptr = reinterpret_cast<int16_t*>(addr);
++  LLBit_ = false;
++  *ptr = value;
++}
++
++uint32_t Simulator::readWU(uint64_t addr, SimInstruction* instr) {
++  if (handleWasmSegFault(addr, 4)) {
++    return -1;
++  }
++  uint32_t* ptr = reinterpret_cast<uint32_t*>(addr);
++  return *ptr;
++}
++
++int32_t Simulator::readW(uint64_t addr, SimInstruction* instr) {
++  if (handleWasmSegFault(addr, 4)) {
++    return -1;
++  }
++  int32_t* ptr = reinterpret_cast<int32_t*>(addr);
++  return *ptr;
++}
++
++void Simulator::writeW(uint64_t addr, uint32_t value, SimInstruction* instr) {
++  if (handleWasmSegFault(addr, 4)) {
++    return;
++  }
++  uint32_t* ptr = reinterpret_cast<uint32_t*>(addr);
++  LLBit_ = false;
++  *ptr = value;
++}
++
++void Simulator::writeW(uint64_t addr, int32_t value, SimInstruction* instr) {
++  if (handleWasmSegFault(addr, 4)) {
++    return;
++  }
++  int32_t* ptr = reinterpret_cast<int32_t*>(addr);
++  LLBit_ = false;
++  *ptr = value;
++}
++
++int64_t Simulator::readDW(uint64_t addr, SimInstruction* instr) {
++  if (handleWasmSegFault(addr, 8)) {
++    return -1;
++  }
++  int64_t* ptr = reinterpret_cast<int64_t*>(addr);
++  return *ptr;
++}
++
++void Simulator::writeDW(uint64_t addr, int64_t value, SimInstruction* instr) {
++  if (handleWasmSegFault(addr, 8)) {
++    return;
++  }
++  int64_t* ptr = reinterpret_cast<int64_t*>(addr);
++  LLBit_ = false;
++  *ptr = value;
++}
++
++double Simulator::readD(uint64_t addr, SimInstruction* instr) {
++  if (handleWasmSegFault(addr, 8)) {
++    return NAN;
++  }
++  double* ptr = reinterpret_cast<double*>(addr);
++  return *ptr;
++}
++
++void Simulator::writeD(uint64_t addr, double value, SimInstruction* instr) {
++  if (handleWasmSegFault(addr, 8)) {
++    return;
++  }
++  double* ptr = reinterpret_cast<double*>(addr);
++  LLBit_ = false;
++  *ptr = value;
++}
++
++// Byte-wide load-reserve / store-conditional (lbarx / stbcx.).
++// Byte accesses have no alignment requirement.
++uint8_t Simulator::loadLinkedB(uint64_t addr, SimInstruction* instr) {
++  if (handleWasmSegFault(addr, 1)) {
++    return 0;
++  }
++  volatile uint8_t* ptr = reinterpret_cast<volatile uint8_t*>(addr);
++  uint8_t value = *ptr;
++  lastLLValue_ = value;
++  LLAddr_ = addr;
++  LLBit_ = true;
++  return value;
++}
++
++int Simulator::storeConditionalB(uint64_t addr, uint8_t value,
++                                 SimInstruction* instr) {
++  if (addr != LLAddr_) {
++    printf("stbcx. to bad address: 0x%016" PRIx64 ", pc=0x%016" PRIxPTR
++           ", expected: 0x%016" PRIxPTR "\n",
++           addr, reinterpret_cast<intptr_t>(instr), LLAddr_);
++    MOZ_CRASH();
++  }
++  SharedMem<uint8_t*> ptr =
++      SharedMem<uint8_t*>::shared(reinterpret_cast<uint8_t*>(addr));
++  if (!LLBit_) {
++    return 0;
++  }
++  LLBit_ = false;
++  LLAddr_ = 0;
++  uint8_t expected = uint8_t(lastLLValue_);
++  uint8_t old =
++      AtomicOperations::compareExchangeSeqCst(ptr, expected, value);
++  return (old == expected) ? 1 : 0;
++}
++
++// Halfword-wide load-reserve / store-conditional (lharx / sthcx.).
++// 2-byte aligned per ISA.
++uint16_t Simulator::loadLinkedH(uint64_t addr, SimInstruction* instr) {
++  if ((addr & 1) == 0) {
++    if (handleWasmSegFault(addr, 2)) {
++      return 0;
++    }
++    volatile uint16_t* ptr = reinterpret_cast<volatile uint16_t*>(addr);
++    uint16_t value = *ptr;
++    lastLLValue_ = value;
++    LLAddr_ = addr;
++    LLBit_ = true;
++    return value;
++  }
++  printf("Unaligned lharx at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
++         reinterpret_cast<intptr_t>(instr));
++  MOZ_CRASH();
++  return 0;
++}
++
++int Simulator::storeConditionalH(uint64_t addr, uint16_t value,
++                                 SimInstruction* instr) {
++  if (addr != LLAddr_) {
++    printf("sthcx. to bad address: 0x%016" PRIx64 ", pc=0x%016" PRIxPTR
++           ", expected: 0x%016" PRIxPTR "\n",
++           addr, reinterpret_cast<intptr_t>(instr), LLAddr_);
++    MOZ_CRASH();
++  }
++  if ((addr & 1) == 0) {
++    SharedMem<uint16_t*> ptr =
++        SharedMem<uint16_t*>::shared(reinterpret_cast<uint16_t*>(addr));
++    if (!LLBit_) {
++      return 0;
++    }
++    LLBit_ = false;
++    LLAddr_ = 0;
++    uint16_t expected = uint16_t(lastLLValue_);
++    uint16_t old =
++        AtomicOperations::compareExchangeSeqCst(ptr, expected, value);
++    return (old == expected) ? 1 : 0;
++  }
++  printf("Unaligned sthcx. at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
++         reinterpret_cast<intptr_t>(instr));
++  MOZ_CRASH();
++  return 0;
++}
++
++int32_t Simulator::loadLinkedW(uint64_t addr, SimInstruction* instr) {
++  if ((addr & 3) == 0) {
++    if (handleWasmSegFault(addr, 4)) {
++      return -1;
++    }
++
++    volatile int32_t* ptr = reinterpret_cast<volatile int32_t*>(addr);
++    int32_t value = *ptr;
++    lastLLValue_ = value;
++    LLAddr_ = addr;
++    LLBit_ = true;
++    return value;
++  }
++  printf("Unaligned lwarx at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
++         reinterpret_cast<intptr_t>(instr));
++  MOZ_CRASH();
++  return 0;
++}
++
++int Simulator::storeConditionalW(uint64_t addr, int32_t value,
++                                 SimInstruction* instr) {
++  if (addr != LLAddr_) {
++    printf("stwcx. to bad address: 0x%016" PRIx64 ", pc=0x%016" PRIxPTR
++           ", expected: 0x%016" PRIxPTR "\n",
++           addr, reinterpret_cast<intptr_t>(instr), LLAddr_);
++    MOZ_CRASH();
++  }
++
++  if ((addr & 3) == 0) {
++    SharedMem<int32_t*> ptr =
++        SharedMem<int32_t*>::shared(reinterpret_cast<int32_t*>(addr));
++
++    if (!LLBit_) {
++      return 0;
++    }
++
++    LLBit_ = false;
++    LLAddr_ = 0;
++    int32_t expected = int32_t(lastLLValue_);
++    int32_t old =
++        AtomicOperations::compareExchangeSeqCst(ptr, expected, value);
++    return (old == expected) ? 1 : 0;
++  }
++  printf("Unaligned stwcx. at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
++         reinterpret_cast<intptr_t>(instr));
++  MOZ_CRASH();
++  return 0;
++}
++
++int64_t Simulator::loadLinkedD(uint64_t addr, SimInstruction* instr) {
++  if ((addr & kPointerAlignmentMask) == 0) {
++    if (handleWasmSegFault(addr, 8)) {
++      return -1;
++    }
++
++    volatile int64_t* ptr = reinterpret_cast<volatile int64_t*>(addr);
++    int64_t value = *ptr;
++    lastLLValue_ = value;
++    LLAddr_ = addr;
++    LLBit_ = true;
++    return value;
++  }
++  printf("Unaligned ldarx at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
++         reinterpret_cast<intptr_t>(instr));
++  MOZ_CRASH();
++  return 0;
++}
++
++int Simulator::storeConditionalD(uint64_t addr, int64_t value,
++                                 SimInstruction* instr) {
++  if (addr != LLAddr_) {
++    printf("stdcx. to bad address: 0x%016" PRIx64 ", pc=0x%016" PRIxPTR
++           ", expected: 0x%016" PRIxPTR "\n",
++           addr, reinterpret_cast<intptr_t>(instr), LLAddr_);
++    MOZ_CRASH();
++  }
++
++  if ((addr & kPointerAlignmentMask) == 0) {
++    SharedMem<int64_t*> ptr =
++        SharedMem<int64_t*>::shared(reinterpret_cast<int64_t*>(addr));
++
++    if (!LLBit_) {
++      return 0;
++    }
++
++    LLBit_ = false;
++    LLAddr_ = 0;
++    int64_t expected = lastLLValue_;
++    int64_t old =
++        AtomicOperations::compareExchangeSeqCst(ptr, expected, value);
++    return (old == expected) ? 1 : 0;
++  }
++  printf("Unaligned stdcx. at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
++         reinterpret_cast<intptr_t>(instr));
++  MOZ_CRASH();
++  return 0;
++}
++
++// -----------------------------------------------------------------------------
++// Stack limit / recursion helpers.
++
++uintptr_t Simulator::stackLimit() const { return stackLimit_; }
++
++uintptr_t* Simulator::addressOfStackLimit() { return &stackLimit_; }
++
++bool Simulator::overRecursed(uintptr_t newsp) const {
++  if (newsp == 0) {
++    newsp = getRegister(sp);
++  }
++  return newsp <= stackLimit();
++}
++
++bool Simulator::overRecursedWithExtra(uint32_t extra) const {
++  uintptr_t newsp = getRegister(sp) - extra;
++  return newsp <= stackLimit();
++}
++
++void Simulator::format(SimInstruction* instr, const char* format) {
++  printf("Simulator found unsupported instruction:\n 0x%016" PRIxPTR
++         ": %08x %s\n",
++         reinterpret_cast<intptr_t>(instr), instr->instructionBits(), format);
++  MOZ_CRASH();
++}
++
++// -----------------------------------------------------------------------------
++// softwareInterrupt - handle kCallRedirInstr (PPC_stop) and PPC_trap.
++
++ABI_FUNCTION_TYPE_SIM_PROTOTYPES
++
++void Simulator::softwareInterrupt(SimInstruction* instr) {
++  uint32_t instrBits = instr->instructionBits();
++
++  if (instrBits == kCallRedirInstr) {
++    Redirection* redirection = Redirection::FromSwiInstruction(instr);
++    uintptr_t nativeFn =
++        reinterpret_cast<uintptr_t>(redirection->nativeFunction());
++
++    // Get the SP for reading stack arguments.
++    int64_t* sp_ = reinterpret_cast<int64_t*>(getRegister(sp));
++    // Skip past the PPC64 ELFv2 link area (4 doublewords = 32 bytes).
++    sp_ = reinterpret_cast<int64_t*>(reinterpret_cast<uintptr_t>(sp_) + 32);
++
++    // PPC64 ELFv2: integer args in r3-r10, FP args in f1-f13.
++    int64_t a0_ = getRegister(r3);
++    int64_t a1_ = getRegister(r4);
++    int64_t a2_ = getRegister(r5);
++    int64_t a3_ = getRegister(r6);
++    int64_t a4_ = getRegister(r7);
++    int64_t a5_ = getRegister(r8);
++    int64_t a6_ = getRegister(r9);
++    int64_t a7_ = getRegister(r10);
++    // PPC64 ELFv2: FP args in f1-f13, mapped to f0_s..f12_s and f0_d..f12_d.
++    float f0_s = getFpuRegisterFloat(Simulator::f1);
++    float f1_s = getFpuRegisterFloat(Simulator::f2);
++    float f2_s = getFpuRegisterFloat(Simulator::f3);
++    float f3_s = getFpuRegisterFloat(Simulator::f4);
++    float f4_s = getFpuRegisterFloat(Simulator::f5);
++    float f5_s = getFpuRegisterFloat(Simulator::f6);
++    float f6_s = getFpuRegisterFloat(Simulator::f7);
++    float f7_s = getFpuRegisterFloat(Simulator::f8);
++    float f8_s = getFpuRegisterFloat(Simulator::f9);
++    float f9_s = getFpuRegisterFloat(Simulator::f10);
++    float f10_s = getFpuRegisterFloat(Simulator::f11);
++    float f11_s = getFpuRegisterFloat(Simulator::f12);
++    float f12_s = getFpuRegisterFloat(Simulator::f13);
++    double f0_d = getFpuRegisterDouble(Simulator::f1);
++    double f1_d = getFpuRegisterDouble(Simulator::f2);
++    double f2_d = getFpuRegisterDouble(Simulator::f3);
++    double f3_d = getFpuRegisterDouble(Simulator::f4);
++    double f4_d = getFpuRegisterDouble(Simulator::f5);
++    double f5_d = getFpuRegisterDouble(Simulator::f6);
++    double f6_d = getFpuRegisterDouble(Simulator::f7);
++    double f7_d = getFpuRegisterDouble(Simulator::f8);
++    double f8_d = getFpuRegisterDouble(Simulator::f9);
++    double f9_d = getFpuRegisterDouble(Simulator::f10);
++    double f10_d = getFpuRegisterDouble(Simulator::f11);
++    double f11_d = getFpuRegisterDouble(Simulator::f12);
++    double f12_d = getFpuRegisterDouble(Simulator::f13);
++
++    // Suppress unused-variable warnings for higher FP arg registers.
++    // They exist for ABI completeness but few function types use >5 FP args.
++    (void)f4_s; (void)f5_s; (void)f6_s; (void)f7_s; (void)f8_s; (void)f9_s;
++    (void)f10_s; (void)f11_s; (void)f12_s;
++    (void)f4_d; (void)f5_d; (void)f6_d; (void)f7_d; (void)f8_d; (void)f9_d;
++    (void)f10_d; (void)f11_d; (void)f12_d;
++
++    int64_t saved_lr = getLR();
++
++    bool stack_aligned = (getRegister(sp) & (ABIStackAlignment - 1)) == 0;
++    if (!stack_aligned) {
++      fprintf(stderr, "Runtime call with unaligned stack!\n");
++      MOZ_CRASH();
++    }
++
++    if (single_stepping_) {
++      single_step_callback_(single_step_callback_arg_, this, nullptr);
++    }
++
++    switch (redirection->type()) {
++      ABI_FUNCTION_TYPE_PPC64_SIM_DISPATCH
++
++      default:
++        MOZ_CRASH("Unknown function type.");
++    }
++
++    if (single_stepping_) {
++      single_step_callback_(single_step_callback_arg_, this, nullptr);
++    }
++
++    setLR(saved_lr);
++    set_pc(getLR());
++  } else if (instrBits == 0x7FE00008) {
++    // PPC_trap: used for wasm traps.
++    uint8_t* newPC;
++    if (wasm::HandleIllegalInstruction(registerState(), &newPC)) {
++      set_pc(int64_t(newPC));
++      return;
++    }
++    MOZ_CRASH("Unexpected trap instruction");
++  } else {
++    // Other trap-like instructions: enter debugger.
++    ppc64Debugger dbg(this);
++    dbg.debug();
++  }
++}
++
++// -----------------------------------------------------------------------------
++// Stop/breakpoint helpers.
++
++bool Simulator::isWatchpoint(uint32_t code) {
++  return (code <= kMaxWatchpointCode);
++}
++
++void Simulator::printWatchpoint(uint32_t code) {
++  ppc64Debugger dbg(this);
++  ++break_count_;
++  printf("\n---- break %d marker: %20" PRIi64 "  (instr count: %20" PRIi64
++         ") ----\n",
++         code, break_count_, icount_);
++  dbg.printAllRegs();
++}
++
++void Simulator::handleStop(uint32_t code, SimInstruction* instr) {
++  if (isEnabledStop(code)) {
++    ppc64Debugger dbg(this);
++    dbg.stop(instr);
++  } else {
++    set_pc(get_pc() + SimInstruction::kInstrSize);
++  }
++}
++
++bool Simulator::isStopInstruction(SimInstruction* instr) {
++  return instr->instructionBits() == kCallRedirInstr;
++}
++
++bool Simulator::isEnabledStop(uint32_t code) {
++  MOZ_ASSERT(code <= kMaxStopCode);
++  MOZ_ASSERT(code > kMaxWatchpointCode);
++  return !(watchedStops_[code].count_ & kStopDisabledBit);
++}
++
++void Simulator::enableStop(uint32_t code) {
++  if (!isEnabledStop(code)) {
++    watchedStops_[code].count_ &= ~kStopDisabledBit;
++  }
++}
++
++void Simulator::disableStop(uint32_t code) {
++  if (isEnabledStop(code)) {
++    watchedStops_[code].count_ |= kStopDisabledBit;
++  }
++}
++
++void Simulator::increaseStopCounter(uint32_t code) {
++  MOZ_ASSERT(code <= kMaxStopCode);
++  if ((watchedStops_[code].count_ & ~(1 << 31)) == 0x7fffffff) {
++    printf(
++        "Stop counter for code %i has overflowed.\n"
++        "Enabling this code and reseting the counter to 0.\n",
++        code);
++    watchedStops_[code].count_ = 0;
++    enableStop(code);
++  } else {
++    watchedStops_[code].count_++;
++  }
++}
++
++void Simulator::printStopInfo(uint32_t code) {
++  if (code <= kMaxWatchpointCode) {
++    printf("That is a watchpoint, not a stop.\n");
++    return;
++  } else if (code > kMaxStopCode) {
++    printf("Code too large, only %u stops can be used\n", kMaxStopCode + 1);
++    return;
++  }
++  const char* state = isEnabledStop(code) ? "Enabled" : "Disabled";
++  int32_t count = watchedStops_[code].count_ & ~kStopDisabledBit;
++  if (count != 0) {
++    if (watchedStops_[code].desc_) {
++      printf("stop %i - 0x%x: \t%s, \tcounter = %i, \t%s\n", code, code,
++             state, count, watchedStops_[code].desc_);
++    } else {
++      printf("stop %i - 0x%x: \t%s, \tcounter = %i\n", code, code, state,
++             count);
++    }
++  }
++}
++
++// =============================================================================
++// Instruction decoders.
++// =============================================================================
++
++// Compute effective address for D-form instructions.
++// If RA==0, the base is 0 (not GPR[0]).
++static inline int64_t DFormEA(Simulator* sim, SimInstruction* instr,
++                              int16_t offset) {
++  uint32_t ra = instr->raValue();
++  int64_t base = (ra == 0) ? 0 : sim->getRegister(ra);
++  return base + offset;
++}
++
++// Compute effective address for DS-form instructions.
++static inline int64_t DSFormEA(Simulator* sim, SimInstruction* instr,
++                               int16_t offset) {
++  uint32_t ra = instr->raValue();
++  int64_t base = (ra == 0) ? 0 : sim->getRegister(ra);
++  return base + offset;
++}
++
++// Compute effective address for X-form indexed instructions.
++// If RA==0, base is 0 (not GPR[0]).
++static inline int64_t XFormEA(Simulator* sim, SimInstruction* instr) {
++  uint32_t ra = instr->raValue();
++  uint32_t rb = instr->rbValue();
++  int64_t base = (ra == 0) ? 0 : sim->getRegister(ra);
++  return base + sim->getRegister(rb);
++}
++
++// Compute effective address for X-form indexed updates (RA != 0 required).
++static inline int64_t XFormEAUpdate(Simulator* sim, SimInstruction* instr) {
++  uint32_t ra = instr->raValue();
++  uint32_t rb = instr->rbValue();
++  return sim->getRegister(ra) + sim->getRegister(rb);
++}
++
++// -----------------------------------------------------------------------------
++// decodeDFormALU: addi, addis, ori, oris, xori, xoris, andi., andis.,
++//                 cmpi, cmpli, subfic, addic, addic., mulli, twi
++
++void Simulator::decodeDFormALU(SimInstruction* instr) {
++  uint32_t opcode = instr->opcode();
++  uint32_t rt = instr->rtValue();
++  uint32_t ra = instr->raValue();
++  int16_t si = instr->imm16Value();
++  uint16_t ui = instr->uimm16Value();
++
++  switch (opcode) {
++    case 14: {
++      // addi: RT = (RA|0) + SI
++      int64_t base = (ra == 0) ? 0 : getRegister(ra);
++      setRegister(rt, base + (int64_t)si);
++      break;
++    }
++    case 15: {
++      // addis: RT = (RA|0) + (SI << 16)
++      int64_t base = (ra == 0) ? 0 : getRegister(ra);
++      setRegister(rt, base + ((int64_t)si << 16));
++      break;
++    }
++    case 24: {
++      // ori: RA = RS | UI
++      setRegister(ra, getRegister(rt) | (uint64_t)ui);
++      break;
++    }
++    case 25: {
++      // oris: RA = RS | (UI << 16)
++      setRegister(ra, getRegister(rt) | ((uint64_t)ui << 16));
++      break;
++    }
++    case 26: {
++      // xori: RA = RS ^ UI
++      setRegister(ra, getRegister(rt) ^ (uint64_t)ui);
++      break;
++    }
++    case 27: {
++      // xoris: RA = RS ^ (UI << 16)
++      setRegister(ra, getRegister(rt) ^ ((uint64_t)ui << 16));
++      break;
++    }
++    case 28: {
++      // andi.: RA = RS & UI, update CR0
++      int64_t result = getRegister(rt) & (uint64_t)ui;
++      setRegister(ra, result);
++      updateCR0(result);
++      break;
++    }
++    case 29: {
++      // andis.: RA = RS & (UI << 16), update CR0
++      int64_t result = getRegister(rt) & ((uint64_t)ui << 16);
++      setRegister(ra, result);
++      updateCR0(result);
++      break;
++    }
++    case 11: {
++      // cmpi: compare RA with SI, signed
++      uint32_t bf = instr->bfValue();
++      bool l = instr->lBit();
++      if (l) {
++        // 64-bit compare
++        setCRFieldCmp(bf, getRegister(ra), (int64_t)si);
++      } else {
++        // 32-bit compare
++        int32_t ra32 = I32(getRegister(ra));
++        setCRFieldCmp(bf, (int64_t)ra32, (int64_t)(int32_t)si);
++      }
++      break;
++    }
++    case 10: {
++      // cmpli: compare RA with UI, unsigned
++      uint32_t bf = instr->bfValue();
++      bool l = instr->lBit();
++      if (l) {
++        // 64-bit unsigned compare
++        setCRFieldCmpU(bf, U64(getRegister(ra)), (uint64_t)ui);
++      } else {
++        // 32-bit unsigned compare
++        uint32_t ra32 = U32(getRegister(ra));
++        setCRFieldCmpU(bf, (uint64_t)ra32, (uint64_t)ui);
++      }
++      break;
++    }
++    case 8: {
++      // subfic: RT = SI - RA, set CA
++      uint64_t ra_val = U64(getRegister(ra));
++      uint64_t imm = U64((int64_t)si);
++      uint64_t result = imm + ~ra_val + 1;
++      setRegister(rt, I64(result));
++      // CA is set if there is a carry out of the addition (~RA + IMM + 1).
++      // Equivalently, CA = (IMM >= RA) for unsigned interpretation of the
++      // full 64-bit subtraction.
++      bool carry = (imm >= ra_val) || (imm == 0 && ra_val == 0);
++      // More precise: carry = (~ra_val + imm) would overflow, or adding 1
++      // overflows.
++      uint64_t tmp = ~ra_val + imm;
++      carry = (tmp < ~ra_val) || (tmp < imm) || (result < tmp);
++      // Simplify: CA if no borrow.
++      carry = (U64((int64_t)si) >= ra_val);
++      if (ra_val == 0) carry = true;
++      // Actually, subfic CA: carry out of ~RA + IMM + 1.
++      // CA = (IMM > RA - 1) when RA != 0, CA = 1 when RA == 0.
++      // Or just: the unsigned result of (SI - RA) is valid (no borrow).
++      // Let's compute it correctly:
++      {
++        __uint128_t wide = (__uint128_t)(~ra_val) + (__uint128_t)imm + 1;
++        carry = (wide >> 64) != 0;
++      }
++      setXERCA(carry);
++      break;
++    }
++    case 12: {
++      // addic: RT = RA + SI, set CA
++      uint64_t ra_val = U64(getRegister(ra));
++      uint64_t imm = U64((int64_t)si);
++      uint64_t result = ra_val + imm;
++      setRegister(rt, I64(result));
++      setXERCA(result < ra_val);
++      break;
++    }
++    case 13: {
++      // addic.: RT = RA + SI, set CA, update CR0
++      uint64_t ra_val = U64(getRegister(ra));
++      uint64_t imm = U64((int64_t)si);
++      uint64_t result = ra_val + imm;
++      setRegister(rt, I64(result));
++      setXERCA(result < ra_val);
++      updateCR0(I64(result));
++      break;
++    }
++    case 7: {
++      // mulli: RT = RA * SI (low 64 bits)
++      int64_t result = getRegister(ra) * (int64_t)si;
++      setRegister(rt, result);
++      break;
++    }
++    case 3: {
++      // twi: Trap Word Immediate. We don't implement trapping in the
++      // simulator; just continue.
++      break;
++    }
++    default:
++      MOZ_CRASH_UNSAFE_PRINTF("decodeDFormALU: unhandled opcode %u", opcode);
++  }
++}
++
++// -----------------------------------------------------------------------------
++// decodeDFormLoad: lwz(32), lbz(34), lhz(40), lha(42), lfs(48), lfd(50)
++//   and update variants
++
++void Simulator::decodeDFormLoad(SimInstruction* instr) {
++  uint32_t opcode = instr->opcode();
++  uint32_t rt = instr->rtValue();
++  int16_t si = instr->imm16Value();
++  uint64_t ea = DFormEA(this, instr, si);
++
++  switch (opcode) {
++    case 32:
++      // lwz
++      setRegister(rt, U64(readWU(ea, instr)));
++      break;
++    case 33: {
++      // lwzu: RA != 0, load and update RA
++      setRegister(rt, U64(readWU(ea, instr)));
++      setRegister(instr->raValue(), ea);
++      break;
++    }
++    case 34:
++      // lbz
++      setRegister(rt, U64(readBU(ea)));
++      break;
++    case 35: {
++      // lbzu
++      setRegister(rt, U64(readBU(ea)));
++      setRegister(instr->raValue(), ea);
++      break;
++    }
++    case 40:
++      // lhz
++      setRegister(rt, U64(readHU(ea, instr)));
++      break;
++    case 41: {
++      // lhzu
++      setRegister(rt, U64(readHU(ea, instr)));
++      setRegister(instr->raValue(), ea);
++      break;
++    }
++    case 42:
++      // lha (half-word, sign-extended)
++      setRegister(rt, (int64_t)readH(ea, instr));
++      break;
++    case 43: {
++      // lhau
++      setRegister(rt, (int64_t)readH(ea, instr));
++      setRegister(instr->raValue(), ea);
++      break;
++    }
++    case 48: {
++      // lfs: load float single, widen to double in FPR (NaN-preserving;
++      // matches Power ISA `lfs` which uses xscvspdpn semantics)
++      if (handleWasmSegFault(ea, 4)) break;
++      float val = *reinterpret_cast<float*>(ea);
++      setFpuRegisterDouble(rt, promoteFloatPreservingNaN(val));
++      break;
++    }
++    case 49: {
++      // lfsu
++      if (handleWasmSegFault(ea, 4)) break;
++      float val = *reinterpret_cast<float*>(ea);
++      setFpuRegisterDouble(rt, promoteFloatPreservingNaN(val));
++      setRegister(instr->raValue(), ea);
++      break;
++    }
++    case 50: {
++      // lfd: load float double
++      double val = readD(ea, instr);
++      setFpuRegisterDouble(rt, val);
++      break;
++    }
++    case 51: {
++      // lfdu
++      double val = readD(ea, instr);
++      setFpuRegisterDouble(rt, val);
++      setRegister(instr->raValue(), ea);
++      break;
++    }
++    default:
++      MOZ_CRASH_UNSAFE_PRINTF("decodeDFormLoad: unhandled opcode %u", opcode);
++  }
++}
++
++// -----------------------------------------------------------------------------
++// decodeDFormStore: stw(36), stwu(37), stb(38), sth(44), stfs(52), stfd(54)
++//   and update variants
++
++void Simulator::decodeDFormStore(SimInstruction* instr) {
++  uint32_t opcode = instr->opcode();
++  uint32_t rs = instr->rsValue();
++  int16_t si = instr->imm16Value();
++
++  // For stores, the effective address calculation differs for update forms:
++  // - Non-update: EA = (RA|0) + D
++  // - Update: EA = RA + D (RA must not be 0)
++  bool isUpdate = false;
++  switch (opcode) {
++    case 37: case 39: case 45: case 53: case 55:
++      isUpdate = true;
++      break;
++  }
++
++  uint64_t ea;
++  if (isUpdate) {
++    ea = getRegister(instr->raValue()) + (int64_t)si;
++  } else {
++    ea = DFormEA(this, instr, si);
++  }
++
++  switch (opcode) {
++    case 36:
++      // stw
++      writeW(ea, I32(getRegister(rs)), instr);
++      break;
++    case 38:
++      // stb
++      writeB(ea, (uint8_t)(getRegister(rs) & 0xFF));
++      break;
++    case 39:
++      // stbu
++      writeB(ea, (uint8_t)(getRegister(rs) & 0xFF));
++      setRegister(instr->raValue(), ea);
++      break;
++    case 44:
++      // sth
++      writeH(ea, U16(getRegister(rs)), instr);
++      break;
++    case 45:
++      // sthu
++      writeH(ea, U16(getRegister(rs)), instr);
++      setRegister(instr->raValue(), ea);
++      break;
++    case 52: {
++      // stfs: convert double in FPR to single and store (NaN-preserving;
++      // matches Power ISA `stfs` which uses xscvdpspn semantics)
++      double dval = getFpuRegisterDouble(rs);
++      float fval = demoteDoublePreservingNaN(dval);
++      if (handleWasmSegFault(ea, 4)) break;
++      *reinterpret_cast<float*>(ea) = fval;
++      LLBit_ = false;
++      break;
++    }
++    case 53: {
++      // stfsu
++      double dval = getFpuRegisterDouble(rs);
++      float fval = demoteDoublePreservingNaN(dval);
++      if (handleWasmSegFault(ea, 4)) break;
++      *reinterpret_cast<float*>(ea) = fval;
++      LLBit_ = false;
++      setRegister(instr->raValue(), ea);
++      break;
++    }
++    case 54:
++      // stfd
++      writeD(ea, getFpuRegisterDouble(rs), instr);
++      break;
++    case 55:
++      // stfdu
++      writeD(ea, getFpuRegisterDouble(rs), instr);
++      setRegister(instr->raValue(), ea);
++      break;
++    default:
++      MOZ_CRASH_UNSAFE_PRINTF("decodeDFormStore: unhandled opcode %u", opcode);
++  }
++}
++
++// -----------------------------------------------------------------------------
++// decodeDSForm: ld(58/0), lwa(58/2), std(62/0), stdu(62/1)
++
++void Simulator::decodeDSForm(SimInstruction* instr) {
++  uint32_t opcode = instr->opcode();
++  uint32_t rt = instr->rtValue();
++  int16_t ds = instr->ds14Value();
++  uint32_t xo = instr->bits(1, 0);
++
++  if (opcode == 58) {
++    uint64_t ea = DSFormEA(this, instr, ds);
++    switch (xo) {
++      case 0:
++        // ld
++        setRegister(rt, readDW(ea, instr));
++        break;
++      case 1: {
++        // ldu
++        setRegister(rt, readDW(ea, instr));
++        setRegister(instr->raValue(), ea);
++        break;
++      }
++      case 2:
++        // lwa (load word algebraic, sign-extended to 64)
++        setRegister(rt, (int64_t)readW(ea, instr));
++        break;
++      default:
++        MOZ_CRASH_UNSAFE_PRINTF("decodeDSForm: opcode 58, xo=%u", xo);
++    }
++  } else if (opcode == 62) {
++    // For std/stdu, EA uses RA directly (no RA|0 rule).
++    uint64_t ea;
++    if (xo == 1) {
++      // stdu: update form
++      ea = getRegister(instr->raValue()) + (int64_t)ds;
++    } else {
++      ea = DSFormEA(this, instr, ds);
++    }
++    switch (xo) {
++      case 0:
++        // std
++        writeDW(ea, getRegister(rt), instr);
++        break;
++      case 1:
++        // stdu
++        writeDW(ea, getRegister(rt), instr);
++        setRegister(instr->raValue(), ea);
++        break;
++      default:
++        MOZ_CRASH_UNSAFE_PRINTF("decodeDSForm: opcode 62, xo=%u", xo);
++    }
++  } else {
++    MOZ_CRASH_UNSAFE_PRINTF("decodeDSForm: unhandled opcode %u", opcode);
++  }
++}
++
++// -----------------------------------------------------------------------------
++// decodeXForm: Major opcode 31 (X-form, XO-form, etc.)
++// This is the largest decoder covering most ALU, indexed load/store, SPR,
++// and atomic instructions.
++
++void Simulator::decodeXForm(SimInstruction* instr) {
++  uint32_t xo = instr->xoValue();
++  uint32_t rt = instr->rtValue();
++  uint32_t ra = instr->raValue();
++  uint32_t rb = instr->rbValue();
++  bool rc = instr->rcBit();
++
++  // Many instructions share major opcode 31. Switch on extended opcode.
++  // For XO-form with OE=1, the xoValue() includes bit 10, so
++  // addo (266 | 512 = 778) etc. are separate cases.
++
++  // First check for isel which uses bits 1-5 = 15 (XO = 15 in bits 1..5).
++  if ((xo & 0x1F) == 15) {
++    // isel: if CR[BC] then RT=RA else RT=RB
++    // BC is in bits 6..10 (the rc field position).
++    uint32_t bc = instr->rcValue();
++    uint32_t crField = bc / 4;
++    uint32_t crBit = bc % 4;
++    uint8_t crFieldVal = getCRField(crField);
++    // PPC CR field bits: bit3=LT(8), bit2=GT(4), bit1=EQ(2), bit0=SO(1)
++    // Bit numbering within field: 0=LT, 1=GT, 2=EQ, 3=SO
++    bool bitSet;
++    switch (crBit) {
++      case 0: bitSet = (crFieldVal & kCRFieldLT) != 0; break;
++      case 1: bitSet = (crFieldVal & kCRFieldGT) != 0; break;
++      case 2: bitSet = (crFieldVal & kCRFieldEQ) != 0; break;
++      case 3: bitSet = (crFieldVal & kCRFieldSO) != 0; break;
++      default: bitSet = false; break;
++    }
++    int64_t raVal = (ra == 0) ? 0 : getRegister(ra);
++    int64_t rbVal = getRegister(rb);
++    setRegister(rt, bitSet ? raVal : rbVal);
++    return;
++  }
++
++  switch (xo) {
++    // --- Arithmetic ---
++    case 266: {
++      // add
++      int64_t result = getRegister(ra) + getRegister(rb);
++      setRegister(rt, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 778: {
++      // addo
++      int64_t ra_val = getRegister(ra);
++      int64_t rb_val = getRegister(rb);
++      int64_t result = ra_val + rb_val;
++      setRegister(rt, result);
++      // Overflow if signs of inputs are same but result sign differs.
++      bool ov = ((ra_val ^ result) & (rb_val ^ result)) < 0;
++      setXEROV(ov);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 10: {
++      // addc
++      uint64_t ra_val = U64(getRegister(ra));
++      uint64_t rb_val = U64(getRegister(rb));
++      uint64_t result = ra_val + rb_val;
++      setRegister(rt, I64(result));
++      setXERCA(result < ra_val);
++      if (rc) updateCR0(I64(result));
++      break;
++    }
++    case 138: {
++      // adde: RT = RA + RB + CA
++      uint64_t ra_val = U64(getRegister(ra));
++      uint64_t rb_val = U64(getRegister(rb));
++      uint64_t ca = getXERCA() ? 1ULL : 0ULL;
++      uint64_t result = ra_val + rb_val + ca;
++      setRegister(rt, I64(result));
++      // Carry-out: when ca==0, only the ra+rb wrap matters; when ca==1,
++      // an additional wrap occurs iff result <= ra_val.
++      bool newCA = ca ? (result <= ra_val) : (result < ra_val);
++      setXERCA(newCA);
++      if (rc) updateCR0(I64(result));
++      break;
++    }
++    case 234: {
++      // addme: RT = RA + CA - 1
++      uint64_t ra_val = U64(getRegister(ra));
++      uint64_t ca = getXERCA() ? 1ULL : 0ULL;
++      uint64_t result = ra_val + ca + ~0ULL;  // + CA + (-1)
++      setRegister(rt, I64(result));
++      // CA if carry out of (RA + CA + 0xFFFFFFFFFFFFFFFF)
++      bool newCA = (ra_val != 0) || (ca != 0);
++      setXERCA(newCA);
++      if (rc) updateCR0(I64(result));
++      break;
++    }
++    case 202: {
++      // addze: RT = RA + CA
++      uint64_t ra_val = U64(getRegister(ra));
++      uint64_t ca = getXERCA() ? 1ULL : 0ULL;
++      uint64_t result = ra_val + ca;
++      setRegister(rt, I64(result));
++      setXERCA(result < ra_val);
++      if (rc) updateCR0(I64(result));
++      break;
++    }
++    case 40: {
++      // subf: RT = RB - RA
++      int64_t result = getRegister(rb) - getRegister(ra);
++      setRegister(rt, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 552: {
++      // subfo: RT = RB - RA, set OV
++      int64_t ra_val = getRegister(ra);
++      int64_t rb_val = getRegister(rb);
++      int64_t result = rb_val - ra_val;
++      setRegister(rt, result);
++      bool ov = ((rb_val ^ ra_val) & (rb_val ^ result)) < 0;
++      setXEROV(ov);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 8: {
++      // subfc: RT = ~RA + RB + 1
++      uint64_t ra_val = U64(getRegister(ra));
++      uint64_t rb_val = U64(getRegister(rb));
++      uint64_t result = ~ra_val + rb_val + 1;
++      setRegister(rt, I64(result));
++      // CA = no borrow = (RB >= RA unsigned)
++      setXERCA(rb_val >= ra_val);
++      if (rc) updateCR0(I64(result));
++      break;
++    }
++    case 136: {
++      // subfe: RT = ~RA + RB + CA
++      uint64_t ra_val = U64(getRegister(ra));
++      uint64_t rb_val = U64(getRegister(rb));
++      uint64_t ca = getXERCA() ? 1ULL : 0ULL;
++      uint64_t result = ~ra_val + rb_val + ca;
++      setRegister(rt, I64(result));
++      __uint128_t wide = (__uint128_t)(~ra_val) + (__uint128_t)rb_val + ca;
++      setXERCA((wide >> 64) != 0);
++      if (rc) updateCR0(I64(result));
++      break;
++    }
++    case 232: {
++      // subfze: RT = ~RA + CA
++      uint64_t ra_val = U64(getRegister(ra));
++      uint64_t ca = getXERCA() ? 1ULL : 0ULL;
++      uint64_t result = ~ra_val + ca;
++      setRegister(rt, I64(result));
++      setXERCA(ca > ra_val);  // CA if ~RA + CA overflows
++      if (rc) updateCR0(I64(result));
++      break;
++    }
++    case 104: {
++      // neg: RT = -RA
++      int64_t result = -getRegister(ra);
++      setRegister(rt, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++
++    // --- Multiply ---
++    case 233: {
++      // mulld: RT = RA * RB (low 64 bits)
++      int64_t result = getRegister(ra) * getRegister(rb);
++      setRegister(rt, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 745: {
++      // mulldo: RT = RA * RB, set OV
++      int64_t ra_val = getRegister(ra);
++      int64_t rb_val = getRegister(rb);
++      int64_t result = ra_val * rb_val;
++      setRegister(rt, result);
++      // OV if high part of full 128-bit product is not all-sign.
++      int64_t hi = MultiplyHighSigned(ra_val, rb_val);
++      bool ov = (hi != (result >> 63));
++      setXEROV(ov);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 235: {
++      // mullw: RT = sign_ext(RA[32:63] * RB[32:63])
++      int64_t result = (int64_t)I32(getRegister(ra)) *
++                       (int64_t)I32(getRegister(rb));
++      setRegister(rt, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 747: {
++      // mullwo
++      int64_t ra_val = I32(getRegister(ra));
++      int64_t rb_val = I32(getRegister(rb));
++      int64_t result = ra_val * rb_val;
++      setRegister(rt, result);
++      bool ov = (result != (int64_t)I32(result));
++      setXEROV(ov);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 73: {
++      // mulhd: RT = high 64 bits of RA * RB (signed)
++      setRegister(rt, MultiplyHighSigned(getRegister(ra), getRegister(rb)));
++      if (rc) updateCR0(getRegister(rt));
++      break;
++    }
++    case 9: {
++      // mulhdu: RT = high 64 bits of RA * RB (unsigned)
++      setRegister(rt, I64(MultiplyHighUnsigned(U64(getRegister(ra)),
++                                               U64(getRegister(rb)))));
++      if (rc) updateCR0(getRegister(rt));
++      break;
++    }
++    case 75: {
++      // mulhw: RT = high 32 bits of (RA[32:63] * RB[32:63]), signed
++      int64_t result =
++          (int64_t)I32(getRegister(ra)) * (int64_t)I32(getRegister(rb));
++      setRegister(rt, result >> 32);
++      if (rc) updateCR0(getRegister(rt));
++      break;
++    }
++    case 11: {
++      // mulhwu: RT = high 32 bits, unsigned
++      uint64_t result =
++          (uint64_t)U32(getRegister(ra)) * (uint64_t)U32(getRegister(rb));
++      setRegister(rt, I64(result >> 32));
++      if (rc) updateCR0(getRegister(rt));
++      break;
++    }
++
++    // --- Divide ---
++    case 489: {
++      // divd: RT = RA / RB (signed, 64-bit)
++      int64_t ra_val = getRegister(ra);
++      int64_t rb_val = getRegister(rb);
++      if (rb_val == 0 || (ra_val == INT64_MIN && rb_val == -1)) {
++        setRegister(rt, 0);
++      } else {
++        setRegister(rt, ra_val / rb_val);
++      }
++      if (rc) updateCR0(getRegister(rt));
++      break;
++    }
++    case 1001: {
++      // divdo
++      int64_t ra_val = getRegister(ra);
++      int64_t rb_val = getRegister(rb);
++      bool ov = (rb_val == 0) || (ra_val == INT64_MIN && rb_val == -1);
++      if (ov) {
++        setRegister(rt, 0);
++      } else {
++        setRegister(rt, ra_val / rb_val);
++      }
++      setXEROV(ov);
++      if (rc) updateCR0(getRegister(rt));
++      break;
++    }
++    case 457: {
++      // divdu: unsigned 64-bit divide
++      uint64_t ra_val = U64(getRegister(ra));
++      uint64_t rb_val = U64(getRegister(rb));
++      if (rb_val == 0) {
++        setRegister(rt, 0);
++      } else {
++        setRegister(rt, I64(ra_val / rb_val));
++      }
++      if (rc) updateCR0(getRegister(rt));
++      break;
++    }
++    case 969: {
++      // divduo
++      uint64_t ra_val = U64(getRegister(ra));
++      uint64_t rb_val = U64(getRegister(rb));
++      bool ov = (rb_val == 0);
++      if (ov) {
++        setRegister(rt, 0);
++      } else {
++        setRegister(rt, I64(ra_val / rb_val));
++      }
++      setXEROV(ov);
++      if (rc) updateCR0(getRegister(rt));
++      break;
++    }
++    case 491: {
++      // divw: signed 32-bit divide
++      int32_t ra_val = I32(getRegister(ra));
++      int32_t rb_val = I32(getRegister(rb));
++      if (rb_val == 0 || (ra_val == INT32_MIN && rb_val == -1)) {
++        setRegister(rt, 0);
++      } else {
++        setRegister(rt, (int64_t)(ra_val / rb_val));
++      }
++      if (rc) updateCR0(getRegister(rt));
++      break;
++    }
++    case 1003: {
++      // divwo
++      int32_t ra_val = I32(getRegister(ra));
++      int32_t rb_val = I32(getRegister(rb));
++      bool ov = (rb_val == 0) || (ra_val == INT32_MIN && rb_val == -1);
++      if (ov) {
++        setRegister(rt, 0);
++      } else {
++        setRegister(rt, (int64_t)(ra_val / rb_val));
++      }
++      setXEROV(ov);
++      if (rc) updateCR0(getRegister(rt));
++      break;
++    }
++    case 459: {
++      // divwu: unsigned 32-bit divide
++      uint32_t ra_val = U32(getRegister(ra));
++      uint32_t rb_val = U32(getRegister(rb));
++      if (rb_val == 0) {
++        setRegister(rt, 0);
++      } else {
++        setRegister(rt, (int64_t)(ra_val / rb_val));
++      }
++      if (rc) updateCR0(getRegister(rt));
++      break;
++    }
++    case 971: {
++      // divwuo
++      uint32_t ra_val = U32(getRegister(ra));
++      uint32_t rb_val = U32(getRegister(rb));
++      bool ov = (rb_val == 0);
++      if (ov) {
++        setRegister(rt, 0);
++      } else {
++        setRegister(rt, (int64_t)(ra_val / rb_val));
++      }
++      setXEROV(ov);
++      if (rc) updateCR0(getRegister(rt));
++      break;
++    }
++
++    // --- POWER9 modulo (ISA 3.0) ---
++    // Result of "undefined" division (rb_val == 0, or signed INT_MIN / -1)
++    // is implementation-defined per Power ISA; matching the divX behaviour
++    // above, we yield 0 in those cases. Rc has no encoding for these ops.
++    case 779: {
++      // modsw: RT = RA % RB (signed, 32-bit)
++      int32_t ra_val = I32(getRegister(ra));
++      int32_t rb_val = I32(getRegister(rb));
++      if (rb_val == 0 || (ra_val == INT32_MIN && rb_val == -1)) {
++        setRegister(rt, 0);
++      } else {
++        setRegister(rt, (int64_t)(ra_val % rb_val));
++      }
++      break;
++    }
++    case 267: {
++      // moduw: RT = RA % RB (unsigned, 32-bit)
++      uint32_t ra_val = U32(getRegister(ra));
++      uint32_t rb_val = U32(getRegister(rb));
++      if (rb_val == 0) {
++        setRegister(rt, 0);
++      } else {
++        setRegister(rt, (int64_t)(ra_val % rb_val));
++      }
++      break;
++    }
++    case 777: {
++      // modsd: RT = RA % RB (signed, 64-bit)
++      int64_t ra_val = getRegister(ra);
++      int64_t rb_val = getRegister(rb);
++      if (rb_val == 0 || (ra_val == INT64_MIN && rb_val == -1)) {
++        setRegister(rt, 0);
++      } else {
++        setRegister(rt, ra_val % rb_val);
++      }
++      break;
++    }
++    case 265: {
++      // modud: RT = RA % RB (unsigned, 64-bit)
++      uint64_t ra_val = U64(getRegister(ra));
++      uint64_t rb_val = U64(getRegister(rb));
++      if (rb_val == 0) {
++        setRegister(rt, 0);
++      } else {
++        setRegister(rt, I64(ra_val % rb_val));
++      }
++      break;
++    }
++
++    // --- Logical ---
++    case 28: {
++      // and: RA = RS & RB
++      int64_t result = getRegister(rt) & getRegister(rb);
++      setRegister(ra, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 60: {
++      // andc: RA = RS & ~RB
++      int64_t result = getRegister(rt) & ~getRegister(rb);
++      setRegister(ra, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 444: {
++      // or: RA = RS | RB
++      int64_t result = getRegister(rt) | getRegister(rb);
++      setRegister(ra, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 412: {
++      // orc: RA = RS | ~RB
++      int64_t result = getRegister(rt) | ~getRegister(rb);
++      setRegister(ra, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 316: {
++      // xor: RA = RS ^ RB
++      int64_t result = getRegister(rt) ^ getRegister(rb);
++      setRegister(ra, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 476: {
++      // nand: RA = ~(RS & RB)
++      int64_t result = ~(getRegister(rt) & getRegister(rb));
++      setRegister(ra, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 124: {
++      // nor: RA = ~(RS | RB)
++      int64_t result = ~(getRegister(rt) | getRegister(rb));
++      setRegister(ra, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 284: {
++      // eqv: RA = ~(RS ^ RB)
++      int64_t result = ~(getRegister(rt) ^ getRegister(rb));
++      setRegister(ra, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++
++    // --- Shifts ---
++    case 27: {
++      // sld: RA = RS << RB[58:63] if RB[57]==0, else RA=0
++      uint64_t shift = U64(getRegister(rb));
++      uint64_t rs_val = U64(getRegister(rt));
++      int64_t result;
++      if (shift & 0x40) {
++        result = 0;
++      } else {
++        result = I64(rs_val << (shift & 0x3F));
++      }
++      setRegister(ra, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 24: {
++      // slw: RA = RS[32:63] << RB[59:63] if RB[58]==0, else RA=0 (32-bit)
++      uint32_t shift = U32(getRegister(rb));
++      uint32_t rs_val = U32(getRegister(rt));
++      uint32_t result;
++      if (shift & 0x20) {
++        result = 0;
++      } else {
++        result = rs_val << (shift & 0x1F);
++      }
++      setRegister(ra, (int64_t)(int32_t)result);
++      if (rc) updateCR0(getRegister(ra));
++      break;
++    }
++    case 539: {
++      // srd: RA = RS >> RB[58:63] if RB[57]==0, else RA=0 (logical)
++      uint64_t shift = U64(getRegister(rb));
++      uint64_t rs_val = U64(getRegister(rt));
++      int64_t result;
++      if (shift & 0x40) {
++        result = 0;
++      } else {
++        result = I64(rs_val >> (shift & 0x3F));
++      }
++      setRegister(ra, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 536: {
++      // srw: RA = RS[32:63] >> RB[59:63] logical (32-bit)
++      uint32_t shift = U32(getRegister(rb));
++      uint32_t rs_val = U32(getRegister(rt));
++      uint32_t result;
++      if (shift & 0x20) {
++        result = 0;
++      } else {
++        result = rs_val >> (shift & 0x1F);
++      }
++      setRegister(ra, (int64_t)(int32_t)result);
++      if (rc) updateCR0(getRegister(ra));
++      break;
++    }
++    case 794: {
++      // srad: RA = RS >> RB[58:63] arithmetic (64-bit), set CA
++      uint64_t shift = U64(getRegister(rb));
++      int64_t rs_val = getRegister(rt);
++      int64_t result;
++      bool carry;
++      if (shift & 0x40) {
++        result = rs_val >> 63;  // all sign bits
++        carry = (rs_val < 0);
++      } else {
++        uint32_t sh = shift & 0x3F;
++        result = rs_val >> sh;
++        // CA = 1 if RS is negative and any 1-bits were shifted out.
++        carry = (rs_val < 0) && ((rs_val & ((1ULL << sh) - 1)) != 0);
++      }
++      setRegister(ra, result);
++      setXERCA(carry);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 792: {
++      // sraw: RA = RS[32:63] >> RB[59:63] arithmetic (32-bit), set CA
++      uint32_t shift = U32(getRegister(rb));
++      int32_t rs_val = I32(getRegister(rt));
++      int32_t result;
++      bool carry;
++      if (shift & 0x20) {
++        result = rs_val >> 31;
++        carry = (rs_val < 0);
++      } else {
++        uint32_t sh = shift & 0x1F;
++        result = rs_val >> sh;
++        carry = (rs_val < 0) && ((rs_val & ((1U << sh) - 1)) != 0);
++      }
++      setRegister(ra, (int64_t)result);
++      setXERCA(carry);
++      if (rc) updateCR0(getRegister(ra));
++      break;
++    }
++    case 826:
++    case 827: {
++      // sradi RA, RS, SH: RA = EXTS(RS) >> sh arithmetic (64-bit), set CA.
++      // XS-form, XO=413 (9-bit, bits 21-29), sh[5] at bit 30, Rc at bit 31.
++      // Our xoValue() extracts bits 10:1 (10 bits)
++      // which yields 413*2 + sh[5] = 826 (sh[5]=0) or 827 (sh[5]=1).
++      // sh[0:4] at instruction bits 15:11 (= raValue field position, but
++      // for this XS-form they're the SH[0:4] subfield).
++      uint32_t sh = instr->bits(15, 11) | (instr->bit(1) << 5);
++      int64_t rs_val = getRegister(rt);
++      int64_t result = (sh == 0) ? rs_val : (rs_val >> sh);
++      // CA := rs_val < 0 && any bits shifted out are 1.
++      bool carry = (rs_val < 0) && sh > 0 &&
++                   ((U64(rs_val) & ((1ULL << sh) - 1)) != 0);
++      setRegister(ra, result);
++      setXERCA(carry);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 824: {
++      // srawi: RA = RS[32:63] >> SH arithmetic (32-bit), set CA
++      uint32_t sh = instr->bits(15, 11);
++      int32_t rs_val = I32(getRegister(rt));
++      int32_t result = rs_val >> sh;
++      bool carry = (rs_val < 0) && sh > 0 &&
++                   ((U32(rs_val) & ((1U << sh) - 1)) != 0);
++      setRegister(ra, (int64_t)result);
++      setXERCA(carry);
++      if (rc) updateCR0(getRegister(ra));
++      break;
++    }
++
++    // --- Extend / count ---
++    case 954: {
++      // extsb: RA = sign_ext(RS[56:63])
++      int64_t result = (int64_t)(int8_t)(getRegister(rt) & 0xFF);
++      setRegister(ra, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 922: {
++      // extsh: RA = sign_ext(RS[48:63])
++      int64_t result = (int64_t)(int16_t)(getRegister(rt) & 0xFFFF);
++      setRegister(ra, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 986: {
++      // extsw: RA = sign_ext(RS[32:63])
++      int64_t result = (int64_t)(int32_t)(getRegister(rt) & 0xFFFFFFFF);
++      setRegister(ra, result);
++      if (rc) updateCR0(result);
++      break;
++    }
++    case 58: {
++      // cntlzd: RA = count leading zeros of RS (64-bit)
++      setRegister(ra, CountLeadingZeros64(U64(getRegister(rt))));
++      if (rc) updateCR0(getRegister(ra));
++      break;
++    }
++    case 26: {
++      // cntlzw: RA = count leading zeros of RS[32:63] (32-bit)
++      setRegister(ra, CountLeadingZeros32(U32(getRegister(rt))));
++      if (rc) updateCR0(getRegister(ra));
++      break;
++    }
++    case 570: {
++      // cnttzd
++      setRegister(ra, CountTrailingZeros64(U64(getRegister(rt))));
++      if (rc) updateCR0(getRegister(ra));
++      break;
++    }
++    case 538: {
++      // cnttzw
++      setRegister(ra, CountTrailingZeros32(U32(getRegister(rt))));
++      if (rc) updateCR0(getRegister(ra));
++      break;
++    }
++    case 506: {
++      // popcntd
++      setRegister(ra, PopCount64(U64(getRegister(rt))));
++      break;
++    }
++    case 378: {
++      // popcntw: popcount each 32-bit half independently, sum in each half
++      uint64_t val = U64(getRegister(rt));
++      uint32_t lo = PopCount32(U32(val));
++      uint32_t hi = PopCount32(U32(val >> 32));
++      setRegister(ra, I64(((uint64_t)hi << 32) | lo));
++      break;
++    }
++    case 122: {
++      // popcntb: popcount each byte independently
++      setRegister(ra, I64(PopCountPerByte(U64(getRegister(rt)))));
++      break;
++    }
++    case 187: {
++      // brd (POWER10): RA = byte-reverse(RS) full 64-bit doubleword.
++      setRegister(ra, I64(__builtin_bswap64(U64(getRegister(rt)))));
++      break;
++    }
++    case 219: {
++      // brh (POWER10): byte-reverse each of the 4 halfwords in RS.
++      uint64_t v = U64(getRegister(rt));
++      uint64_t out = ((v & 0xFF00FF00FF00FF00ULL) >> 8) |
++                     ((v & 0x00FF00FF00FF00FFULL) << 8);
++      setRegister(ra, I64(out));
++      break;
++    }
++    case 155: {
++      // brw (POWER10): byte-reverse each of the 2 words in RS.
++      uint64_t v = U64(getRegister(rt));
++      uint64_t out = ((uint64_t)__builtin_bswap32((uint32_t)(v >> 32)) << 32) |
++                     (uint64_t)__builtin_bswap32((uint32_t)v);
++      setRegister(ra, I64(out));
++      break;
++    }
++
++    // --- Compare (X-form) ---
++    case 0: {
++      // cmp (cmpw/cmpd): signed compare
++      uint32_t bf = instr->bfValue();
++      bool l = instr->lBit();
++      if (l) {
++        setCRFieldCmp(bf, getRegister(ra), getRegister(rb));
++      } else {
++        setCRFieldCmp(bf, (int64_t)I32(getRegister(ra)),
++                      (int64_t)I32(getRegister(rb)));
++      }
++      break;
++    }
++    case 32: {
++      // cmpl (cmplw/cmpld): unsigned compare
++      uint32_t bf = instr->bfValue();
++      bool l = instr->lBit();
++      if (l) {
++        setCRFieldCmpU(bf, U64(getRegister(ra)), U64(getRegister(rb)));
++      } else {
++        setCRFieldCmpU(bf, (uint64_t)U32(getRegister(ra)),
++                       (uint64_t)U32(getRegister(rb)));
++      }
++      break;
++    }
++
++    // --- Trap ---
++    case 4: {
++      // tw: Trap Word. The JIT uses this for debugging / tagging.
++      // In the simulator we just treat it as a NOP (the JIT uses tagged
++      // trap words that are never actually reached during normal execution,
++      // they serve as metadata for the patcher).
++      break;
++    }
++
++    // --- SPR ---
++    case 339: {
++      // mfspr: RT = SPR
++      // SPR encoding: spr[4:0] at bits 16..20, spr[9:5] at bits 11..15
++      uint32_t spr_lo = instr->raValue();  // bits 16..20
++      uint32_t spr_hi = instr->rbValue();  // bits 11..15
++      uint32_t spr = (spr_lo) | (spr_hi << 5);
++      switch (spr) {
++        case 8:  // LR
++          setRegister(rt, getLR());
++          break;
++        case 9:  // CTR
++          setRegister(rt, getCTR());
++          break;
++        case 1:  // XER
++          setRegister(rt, I64(getXER()));
++          break;
++        default:
++          MOZ_CRASH_UNSAFE_PRINTF("mfspr: unhandled SPR %u", spr);
++      }
++      break;
++    }
++    case 467: {
++      // mtspr: SPR = RS
++      uint32_t spr_lo = instr->raValue();
++      uint32_t spr_hi = instr->rbValue();
++      uint32_t spr = (spr_lo) | (spr_hi << 5);
++      int64_t val = getRegister(rt);
++      switch (spr) {
++        case 8:  // LR
++          setLR(val);
++          break;
++        case 9:  // CTR
++          setCTR(val);
++          break;
++        case 1:  // XER
++          setXER(U64(val));
++          break;
++        default:
++          MOZ_CRASH_UNSAFE_PRINTF("mtspr: unhandled SPR %u", spr);
++      }
++      break;
++    }
++    case 19: {
++      // mfocrf: read one CR field selected by the FXM bitmask into RT.
++      // (Plain mfcr shares this XO with FXM=0; we model both by reading
++      // the full CR — the JIT only emits mfocrf and the bits outside the
++      // selected field are spec'd "undefined", so reading the full CR is
++      // a valid implementation.)
++      setRegister(rt, (int64_t)getCR());
++      break;
++    }
++    case 144: {
++      // mtcrf: move to CR fields
++      // FXM field is in bits 12..19.
++      uint32_t fxm = instr->bits(19, 12);
++      uint32_t rs_val = U32(getRegister(rt));
++      uint32_t cr = getCR();
++      for (int i = 0; i < 8; i++) {
++        if (fxm & (0x80 >> i)) {
++          uint32_t shift = 4 * (7 - i);
++          cr = (cr & ~(0xFu << shift)) | (rs_val & (0xFu << shift));
++        }
++      }
++      setCR(cr);
++      break;
++    }
++    case 576: {
++      // mcrxrx: move XER[OV,OV32,CA,CA32] to CR field BF
++      uint32_t bf = instr->bfValue();
++      uint8_t field = 0;
++      if (getXEROV()) field |= 0x8;
++      // OV32 at bit 19 of XER
++      if ((getXER() >> kXEROV32Bit) & 1) field |= 0x4;
++      if (getXERCA()) field |= 0x2;
++      if ((getXER() >> kXERCA32Bit) & 1) field |= 0x1;
++      setCRField(bf, field);
++      break;
++    }
++    case 384:
++    case 416: {
++      // POWER10 setbc/setbcr: RT = (CR[BI]==N) ? 1 : 0
++      // BI at bits 11..15; xo=384 (setbc, N=1), xo=416 (setbcr, N=0).
++      uint32_t bi = instr->raValue();
++      uint32_t crField = bi / 4;
++      uint32_t crBit = bi % 4;
++      uint8_t crFieldVal = getCRField(crField);
++      bool bitSet;
++      switch (crBit) {
++        case 0: bitSet = (crFieldVal & kCRFieldLT) != 0; break;
++        case 1: bitSet = (crFieldVal & kCRFieldGT) != 0; break;
++        case 2: bitSet = (crFieldVal & kCRFieldEQ) != 0; break;
++        case 3: bitSet = (crFieldVal & kCRFieldSO) != 0; break;
++        default: bitSet = false; break;
++      }
++      bool want = (xo == 384) ? bitSet : !bitSet;
++      setRegister(rt, want ? 1 : 0);
++      break;
++    }
++
++    // --- Indexed loads ---
++    case 21: {
++      // ldx: RT = [RA|0 + RB], 8 bytes
++      uint64_t ea = XFormEA(this, instr);
++      setRegister(rt, readDW(ea, instr));
++      break;
++    }
++    case 53: {
++      // ldux: RT = [RA + RB], update RA
++      uint64_t ea = XFormEAUpdate(this, instr);
++      setRegister(rt, readDW(ea, instr));
++      setRegister(ra, ea);
++      break;
++    }
++    case 23: {
++      // lwzx: RT = zero_ext([RA|0 + RB], 4 bytes)
++      uint64_t ea = XFormEA(this, instr);
++      setRegister(rt, U64(readWU(ea, instr)));
++      break;
++    }
++    case 341: {
++      // lwax: RT = sign_ext([RA|0 + RB], 4 bytes)
++      uint64_t ea = XFormEA(this, instr);
++      setRegister(rt, (int64_t)readW(ea, instr));
++      break;
++    }
++    case 87: {
++      // lbzx
++      uint64_t ea = XFormEA(this, instr);
++      setRegister(rt, U64(readBU(ea)));
++      break;
++    }
++    case 279: {
++      // lhzx
++      uint64_t ea = XFormEA(this, instr);
++      setRegister(rt, U64(readHU(ea, instr)));
++      break;
++    }
++    case 343: {
++      // lhax
++      uint64_t ea = XFormEA(this, instr);
++      setRegister(rt, (int64_t)readH(ea, instr));
++      break;
++    }
++    case 535: {
++      // lfsx: load float single indexed, widen to double (NaN-preserving)
++      uint64_t ea = XFormEA(this, instr);
++      if (!handleWasmSegFault(ea, 4)) {
++        float val = *reinterpret_cast<float*>(ea);
++        setFpuRegisterDouble(rt, promoteFloatPreservingNaN(val));
++      }
++      break;
++    }
++    case 599: {
++      // lfdx: load float double indexed
++      uint64_t ea = XFormEA(this, instr);
++      setFpuRegisterDouble(rt, readD(ea, instr));
++      break;
++    }
++    case 855: {
++      // lfiwax: load float as integer word algebraic
++      uint64_t ea = XFormEA(this, instr);
++      int32_t val = readW(ea, instr);
++      setFpuRegister(rt, (int64_t)val);
++      break;
++    }
++    case 887: {
++      // lfiwzx: load float as integer word zero
++      uint64_t ea = XFormEA(this, instr);
++      uint32_t val = readWU(ea, instr);
++      setFpuRegister(rt, (int64_t)(uint64_t)val);
++      break;
++    }
++
++    // --- Indexed stores ---
++    case 149: {
++      // stdx
++      uint64_t ea = XFormEA(this, instr);
++      writeDW(ea, getRegister(rt), instr);
++      break;
++    }
++    case 151: {
++      // stwx
++      uint64_t ea = XFormEA(this, instr);
++      writeW(ea, I32(getRegister(rt)), instr);
++      break;
++    }
++    case 215: {
++      // stbx
++      uint64_t ea = XFormEA(this, instr);
++      writeB(ea, (uint8_t)(getRegister(rt) & 0xFF));
++      break;
++    }
++    case 407: {
++      // sthx
++      uint64_t ea = XFormEA(this, instr);
++      writeH(ea, U16(getRegister(rt)), instr);
++      break;
++    }
++    case 663: {
++      // stfsx: store float single indexed (NaN-preserving)
++      uint64_t ea = XFormEA(this, instr);
++      if (!handleWasmSegFault(ea, 4)) {
++        float fval = demoteDoublePreservingNaN(getFpuRegisterDouble(rt));
++        *reinterpret_cast<float*>(ea) = fval;
++        LLBit_ = false;
++      }
++      break;
++    }
++    case 727: {
++      // stfdx: store float double indexed
++      uint64_t ea = XFormEA(this, instr);
++      writeD(ea, getFpuRegisterDouble(rt), instr);
++      break;
++    }
++
++    // --- Byte-reversed stores ---
++    case 662: {
++      // stwbrx
++      uint64_t ea = XFormEA(this, instr);
++      uint32_t val = U32(getRegister(rt));
++      writeW(ea, (int32_t)__builtin_bswap32(val), instr);
++      break;
++    }
++
++    // --- Atomic load/store ---
++    //
++    // Load-reserve and store-conditional. Sub-word variants
++    // (lbarx/lharx/stbcx./sthcx.) were added in ISA v2.06 (POWER7+).
++    // Word/doubleword variants (lwarx/stwcx./ldarx/stdcx.) go back
++    // to the base ISA.
++    case 52: {
++      // lbarx RT, RA, RB, EH
++      uint64_t ea = XFormEA(this, instr);
++      uint8_t val = loadLinkedB(ea, instr);
++      setRegister(rt, (int64_t)val);
++      break;
++    }
++    case 116: {
++      // lharx RT, RA, RB, EH
++      uint64_t ea = XFormEA(this, instr);
++      uint16_t val = loadLinkedH(ea, instr);
++      setRegister(rt, (int64_t)val);
++      break;
++    }
++    case 694: {
++      // stbcx. RS, RA, RB: always Rc=1.
++      uint64_t ea = XFormEA(this, instr);
++      uint8_t val = uint8_t(getRegister(rt));
++      int result = storeConditionalB(ea, val, instr);
++      if (result) {
++        setCRField(0, kCRFieldEQ | (kCRFieldSO * getXERSO()));
++      } else {
++        setCRField(0, kCRFieldSO * getXERSO());
++      }
++      break;
++    }
++    case 726: {
++      // sthcx. RS, RA, RB: always Rc=1.
++      uint64_t ea = XFormEA(this, instr);
++      uint16_t val = uint16_t(getRegister(rt));
++      int result = storeConditionalH(ea, val, instr);
++      if (result) {
++        setCRField(0, kCRFieldEQ | (kCRFieldSO * getXERSO()));
++      } else {
++        setCRField(0, kCRFieldSO * getXERSO());
++      }
++      break;
++    }
++    case 20: {
++      // lwarx
++      uint64_t ea = XFormEA(this, instr);
++      int32_t val = loadLinkedW(ea, instr);
++      setRegister(rt, (int64_t)val);
++      break;
++    }
++    case 150: {
++      // stwcx.
++      uint64_t ea = XFormEA(this, instr);
++      int32_t val = I32(getRegister(rt));
++      int result = storeConditionalW(ea, val, instr);
++      // stwcx. always updates CR0: EQ if store succeeded, else clear.
++      if (result) {
++        setCRField(0, kCRFieldEQ | (kCRFieldSO * getXERSO()));
++      } else {
++        setCRField(0, kCRFieldSO * getXERSO());
++      }
++      break;
++    }
++    case 84: {
++      // ldarx
++      uint64_t ea = XFormEA(this, instr);
++      int64_t val = loadLinkedD(ea, instr);
++      setRegister(rt, val);
++      break;
++    }
++    case 214: {
++      // stdcx.
++      uint64_t ea = XFormEA(this, instr);
++      int64_t val = getRegister(rt);
++      int result = storeConditionalD(ea, val, instr);
++      if (result) {
++        setCRField(0, kCRFieldEQ | (kCRFieldSO * getXERSO()));
++      } else {
++        setCRField(0, kCRFieldSO * getXERSO());
++      }
++      break;
++    }
++
++    // --- Synchronization ---
++    case 598:
++      // sync / lwsync / ptesync: no-op in simulator
++      break;
++    case 854:
++      // eieio: no-op in simulator
++      break;
++
++    // --- GPR <-> VSR move (major opcode 31, XX1-form) ---
++    //
++    // Two sub-encodings:
++    //   mtvsr* XT,RA{,RB}: XX1Form — XT at bits 25:21 (5) + TX at bit 0 (1);
++    //                      RA at bits 20:16; RB (if any) at bits 15:11.
++    //   mfvsr* RA,XS:      XX1FormMfvsr — XS at bits 25:21 (5) + SX at bit 0 (1);
++    //                      RA (GPR dest) at bits 20:16.
++    //
++    // The original decoder treated "rsValue()" (bits 25:21 = VSR field) as a
++    // GPR index — doubly wrong: the GPR side lives at bits 20:16 (= raValue())
++    // and the VSR side is 6 bits (5-bit field + extension bit at bit 0). Fixed
++    // here and extended for the full VSR namespace (0-63).
++    // The ISA names each field in BE. "XT.DW0" is the BE doubleword which on
++    // PPC64LE register storage lives at LE bytes 8-15 (our bytes[] is LE-natural:
++    // bytes[0] = lowest address). With `mtvsrd / mfvsrd / mtvsrdd / mfvsrld
++    // / stxvx`: mtvsrd of 0x1122334455667788 produces `00 00 00 00 00 00 00 00
++    // 88 77 66 55 44 33 22 11` in memory (LE bytes 8-15 hold the GPR bits with
++    // LSB at byte 8). Matching semantics here means the sim respects
++    // the full Power ISA, not a self-consistent LE-reversed
++    // convention.
++    case 51: {
++      // mfvsrd RA, XS: GPR[RA] = XS.DW0 = LE bytes 8..15.
++      int xs = int(instr->rtValue() | (instr->bit(0) << 5));  // T + SX(TX)
++      uint8_t bytes[16];
++      getVSR128(xs, bytes);
++      int64_t val;
++      memcpy(&val, bytes + 8, 8);
++      setRegister(instr->raValue(), val);
++      break;
++    }
++    case 211: {
++      // mtvsrwa XT, RA: XT.DW0 = sign_ext_64(RA[32:63]); XT.DW1 = 0.
++      // POWER8+ (ISA 2.07). Combines extsw + mtvsrd. LE layout: bytes
++      // 8-15 ← sign-extended low 32 of RA; bytes 0-7 ← 0.
++      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t bytes[16];
++      int64_t val = (int64_t)(int32_t)getRegister(instr->raValue());
++      memset(bytes, 0, 8);
++      memcpy(bytes + 8, &val, 8);
++      setVSR128(xt, bytes);
++      break;
++    }
++    case 179: {
++      // mtvsrd XT, RA: XT.DW0 = RA; XT.DW1 = 0.
++      // LE layout: bytes 8-15 ← RA, bytes 0-7 ← 0.
++      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t bytes[16];
++      int64_t val = getRegister(instr->raValue());
++      memset(bytes, 0, 8);
++      memcpy(bytes + 8, &val, 8);
++      setVSR128(xt, bytes);
++      break;
++    }
++    case 243: {
++      // mtvsrwz XT, RA: XT.DW0 = zero_ext(RA[32:63]); XT.DW1 = 0.
++      // The 32-bit value lives in the low 32 bits of DW0 = BE word 1,
++      // which on LE storage is LE bytes 8..11 (LE word 2); LE bytes
++      // 12..15 = 0 (upper half of DW0 = BE word 0 = zero-extended).
++      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t bytes[16];
++      uint32_t lo = U32(getRegister(instr->raValue()));
++      memset(bytes, 0, 16);
++      bytes[8]  = (uint8_t)(lo);
++      bytes[9]  = (uint8_t)(lo >> 8);
++      bytes[10] = (uint8_t)(lo >> 16);
++      bytes[11] = (uint8_t)(lo >> 24);
++      setVSR128(xt, bytes);
++      break;
++    }
++    case 307: {
++      // mfvsrld RA, XS: GPR[RA] = XS.DW1 = LE bytes 0..7.
++      // POWER9.
++      int xs = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t bytes[16];
++      getVSR128(xs, bytes);
++      int64_t val;
++      memcpy(&val, bytes, 8);
++      setRegister(instr->raValue(), val);
++      break;
++    }
++    case 403: {
++      // mtvsrws XT, RA (POWER9): splat low 32 bits of RA into all four
++      // word elements of XT. The same 32-bit value appears in lanes 0..3,
++      // so the byte layout is identical in LE and BE —
++      // bytes 0..15 = lo | lo | lo | lo.
++      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t bytes[16];
++      uint32_t lo = U32(getRegister(instr->raValue()));
++      uint64_t val = ((uint64_t)lo << 32) | lo;
++      memcpy(bytes, &val, 8);
++      memcpy(bytes + 8, &val, 8);
++      setVSR128(xt, bytes);
++      break;
++    }
++    case 435: {
++      // mtvsrdd XT, RA, RB: XT.DW0 = RA; XT.DW1 = RB. POWER9.
++      // LE: bytes 8-15 ← RA, bytes 0-7 ← RB.
++      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t bytes[16];
++      int64_t dw0 = getRegister(instr->raValue());
++      int64_t dw1 = getRegister(instr->rbValue());
++      memcpy(bytes,     &dw1, 8);
++      memcpy(bytes + 8, &dw0, 8);
++      setVSR128(xt, bytes);
++      break;
++    }
++
++    // --- VMX vector memory (major opcode 31) ---
++    //
++    // lvx / stvx / lvxl / stvxl.
++    //   EA = (RA|0) + RB; EA = EA & ~0xF (alignment)
++    //   lvx:  VRT[0:127] <- MEM(EA, 16)       bytes[0] = *(EA+0)
++    //   stvx: MEM(EA, 16) <- VRS[0:127]       *(EA+0) = bytes[0]
++    // lvxl / stvxl are identical in effect to lvx / stvx (the "l" form
++    // hints "least recently used"; semantically indistinguishable).
++    case 103: {
++      // lvx: VRT = MEM(EA & ~0xF, 16 bytes)
++      uint64_t ea = XFormEA(this, instr) & ~uint64_t(0xF);
++      if (handleWasmSegFault(ea, 16)) break;
++      memcpy(VRregisters_[rt], reinterpret_cast<const void*>(ea), 16);
++      break;
++    }
++    case 231: {
++      // stvx: MEM(EA & ~0xF, 16 bytes) = VRS
++      uint64_t ea = XFormEA(this, instr) & ~uint64_t(0xF);
++      if (handleWasmSegFault(ea, 16)) break;
++      memcpy(reinterpret_cast<void*>(ea), VRregisters_[rt], 16);
++      break;
++    }
++    case 359: {
++      // lvxl: semantically identical to lvx
++      uint64_t ea = XFormEA(this, instr) & ~uint64_t(0xF);
++      if (handleWasmSegFault(ea, 16)) break;
++      memcpy(VRregisters_[rt], reinterpret_cast<const void*>(ea), 16);
++      break;
++    }
++    case 487: {
++      // stvxl: semantically identical to stvx
++      uint64_t ea = XFormEA(this, instr) & ~uint64_t(0xF);
++      if (handleWasmSegFault(ea, 16)) break;
++      memcpy(reinterpret_cast<void*>(ea), VRregisters_[rt], 16);
++      break;
++    }
++
++    // --- VSX vector memory indexed (major opcode 31) ---
++    //
++    // These ops take a 6-bit VSR register,
++    // encoded as 5-bit T/S + 1-bit TX/SX extension at instruction LSB
++    // bit 0 (= our instr->bit(0)). EA = (RA|0) + RB. 16-byte access,
++    // not forced-aligned (hardware may handle misaligned via sub-access
++    // or alignment interrupt per impl).
++    //
++    // Byte-order note: lxvx/stxvx perform a natural 16-byte LE
++    // memcpy. lxvd2x/stxvd2x on real PPC64 LE hardware load/store
++    // doublewords in BE-pair order — i.e. lxvd2x places memory bytes
++    // 0-7 in the register's BE-DW0 (= LE bytes 8-15) and bytes 8-15
++    // in BE-DW1 (= LE bytes 0-7). The JIT brackets every wasm SIMD
++    // load/store with a compensating xxpermdi DM=2 so the net effect
++    // is a natural LE byte order. The constant pool emits the same
++    // lxvd2x + xxpermdi sequence (per PatchConstantPoolLoad) but
++    // assumes the hardware semantics, not a plain memcpy. So the sim
++    // must match real-hardware lxvd2x/stxvd2x semantics including the
++    // BE-DW byte order — otherwise the post-load xxpermdi unswaps
++    // bytes that were never swapped, and constant-pool Simd128 loads
++    // (e.g. shuffle masks) come out with halves transposed.
++    case 268: {
++      // lxvx: XT = MEM((RA|0)+RB, 16)
++      uint64_t ea = XFormEA(this, instr);
++      if (handleWasmSegFault(ea, 16)) break;
++      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t buf[16];
++      memcpy(buf, reinterpret_cast<const void*>(ea), 16);
++      setVSR128(xt, buf);
++      break;
++    }
++    case 396: {
++      // stxvx: MEM((RA|0)+RB, 16) = XS
++      uint64_t ea = XFormEA(this, instr);
++      if (handleWasmSegFault(ea, 16)) break;
++      int xs = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t buf[16];
++      getVSR128(xs, buf);
++      memcpy(reinterpret_cast<void*>(ea), buf, 16);
++      break;
++    }
++    case 813: {
++      // lxsihzx XT, RA, RB: P9 (ISA 3.0). Load halfword to VSR & zero,
++      // indexed. MEM(EA, 2) (LE-natural halfword) is placed in dw[0]
++      // low 16 bits; the rest of the VSR is zeroed. In sim LE-byte
++      // storage, that is bytes[8..9] (low byte at bytes[8]).
++      uint64_t ea = XFormEA(this, instr);
++      if (handleWasmSegFault(ea, 2)) break;
++      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint16_t halfword = readH(ea, instr);
++      uint8_t buf[16];
++      memset(buf, 0, 16);
++      buf[8] = (uint8_t)(halfword & 0xFF);
++      buf[9] = (uint8_t)((halfword >> 8) & 0xFF);
++      setVSR128(xt, buf);
++      break;
++    }
++    case 941: {
++      // stxsihx XS, RA, RB: P9 (ISA 3.0). Store halfword from VSR,
++      // indexed. dw[0] low 16 bits (sim bytes[8..9] in host-LE order)
++      // are written as a halfword at MEM(EA, 2).
++      uint64_t ea = XFormEA(this, instr);
++      if (handleWasmSegFault(ea, 2)) break;
++      int xs = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t buf[16];
++      getVSR128(xs, buf);
++      uint16_t halfword =
++          (uint16_t)buf[8] | ((uint16_t)buf[9] << 8);
++      writeH(ea, halfword, instr);
++      break;
++    }
++    case 844: {
++      // lxvd2x: XT = MEM((RA|0)+RB, 16) with BE-DW byte ordering.
++      // Memory bytes 0-7 land in BE-DW0 (= LE bytes 8-15); memory
++      // bytes 8-15 land in BE-DW1 (= LE bytes 0-7).
++      uint64_t ea = XFormEA(this, instr);
++      if (handleWasmSegFault(ea, 16)) break;
++      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t mem[16], buf[16];
++      memcpy(mem, reinterpret_cast<const void*>(ea), 16);
++      memcpy(buf, mem + 8, 8);
++      memcpy(buf + 8, mem, 8);
++      setVSR128(xt, buf);
++      break;
++    }
++    case 972: {
++      // stxvd2x: MEM((RA|0)+RB, 16) = XS with BE-DW byte ordering.
++      // Inverse of lxvd2x: register LE bytes 0-7 → memory bytes 8-15;
++      // LE bytes 8-15 → memory bytes 0-7.
++      uint64_t ea = XFormEA(this, instr);
++      if (handleWasmSegFault(ea, 16)) break;
++      int xs = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t buf[16], mem[16];
++      getVSR128(xs, buf);
++      memcpy(mem, buf + 8, 8);
++      memcpy(mem + 8, buf, 8);
++      memcpy(reinterpret_cast<void*>(ea), mem, 16);
++      break;
++    }
++
++    default:
++      MOZ_CRASH_UNSAFE_PRINTF(
++          "decodeXForm: unimplemented XO=%u (instruction 0x%08x)", xo,
++          instr->instructionBits());
++  }
++}
++
++// -----------------------------------------------------------------------------
++// decodeRotateMask: rlwinm(21), rlwnm(23), rlwimi(20),
++//   rldicl(30), rldicr(30), rldic(30), rldimi(30), rldcl(30), rldcr(30)
++
++void Simulator::decodeRotateMask(SimInstruction* instr) {
++  uint32_t opcode = instr->opcode();
++
++  if (opcode == 21) {
++    // rlwinm: RA = ROTL32(RS,SH) & MASK(MB,ME), Rc
++    uint32_t rs_val = U32(getRegister(instr->rsValue()));
++    uint32_t sh = instr->mSHValue();
++    uint32_t mb = instr->mMBValue();
++    uint32_t me = instr->mMEValue();
++    uint32_t rotated = RotateLeft32(rs_val, sh);
++    uint32_t mask = MASK32(mb, me);
++    int64_t result = (int64_t)(uint64_t)(rotated & mask);
++    setRegister(instr->raValue(), result);
++    if (instr->rcBit()) updateCR0(result);
++  } else if (opcode == 23) {
++    // rlwnm: RA = ROTL32(RS,RB[27:31]) & MASK(MB,ME), Rc
++    uint32_t rs_val = U32(getRegister(instr->rsValue()));
++    uint32_t sh = U32(getRegister(instr->rbValue())) & 0x1F;
++    uint32_t mb = instr->mMBValue();
++    uint32_t me = instr->mMEValue();
++    uint32_t rotated = RotateLeft32(rs_val, sh);
++    uint32_t mask = MASK32(mb, me);
++    int64_t result = (int64_t)(uint64_t)(rotated & mask);
++    setRegister(instr->raValue(), result);
++    if (instr->rcBit()) updateCR0(result);
++  } else if (opcode == 20) {
++    // rlwimi: RA = (ROTL32(RS,SH) & MASK) | (RA & ~MASK), Rc
++    uint32_t rs_val = U32(getRegister(instr->rsValue()));
++    uint32_t sh = instr->mSHValue();
++    uint32_t mb = instr->mMBValue();
++    uint32_t me = instr->mMEValue();
++    uint32_t rotated = RotateLeft32(rs_val, sh);
++    uint32_t mask = MASK32(mb, me);
++    uint32_t ra_val = U32(getRegister(instr->raValue()));
++    int64_t result = (int64_t)(uint64_t)((rotated & mask) | (ra_val & ~mask));
++    setRegister(instr->raValue(), result);
++    if (instr->rcBit()) updateCR0(result);
++  } else if (opcode == 30) {
++    // MD-form / MDS-form: 64-bit rotate/mask
++    uint32_t rs = instr->rsValue();
++    uint64_t rs_val = U64(getRegister(rs));
++    uint32_t ra_reg = instr->raValue();
++
++    // Determine which sub-opcode: bits 2..4 for MD-form, bit 4 for MDS.
++    // MD: bits 2..4
++    // MDS: bit 4 (rldcl has bit4=0, bit3..2=00 with bit1=1; rldcr has
++    //      bit4=0, bit3..2=01 with bit1=1). Actually:
++    //   rldicl:  30 | MD-XO=0 (bits 2..4 = 000), bit1=0
++    //   rldicr:  30 | MD-XO=1 (bits 2..4 = 001), bit1=0
++    //   rldic:   30 | MD-XO=2 (bits 2..4 = 010), bit1=0
++    //   rldimi:  30 | MD-XO=3 (bits 2..4 = 011), bit1=0
++    //   rldcl:   30 | MDS, bit4=0, bit3..1=000, bit0=Rc => bits 1..4=1000
++    //            Actually rldcl: bits 1..4 = 1000, i.e. bit(4)=1,bit(3)=0,
++    //            bit(2)=0,bit(1)=0
++    //   rldcr:   30 | MDS, bits 1..4 = 1001
++    //
++    // Let's check bit 4 first: if bit(4)==1, it's MDS-form (rldcl/rldcr).
++    if (instr->bit(4)) {
++      // MDS-form: shift amount from RB register
++      uint32_t sh = U32(getRegister(instr->rbValue())) & 0x3F;
++      uint64_t rotated = RotateLeft64(rs_val, sh);
++      uint32_t mb = instr->mdsMBValue();
++
++      if (!instr->bit(1)) {
++        // rldcl: RA = ROTL64(RS, RB[58:63]) & MASK(mb, 63)
++        uint64_t mask = MASK64(mb, 63);
++        int64_t result = I64(rotated & mask);
++        setRegister(ra_reg, result);
++        if (instr->rcBit()) updateCR0(result);
++      } else {
++        // rldcr: RA = ROTL64(RS, RB[58:63]) & MASK(0, me)
++        uint32_t me = instr->mdsMBValue();
++        uint64_t mask = MASK64(0, me);
++        int64_t result = I64(rotated & mask);
++        setRegister(ra_reg, result);
++        if (instr->rcBit()) updateCR0(result);
++      }
++    } else {
++      // MD-form
++      uint32_t sh = instr->mdSHValue();
++      uint64_t rotated = RotateLeft64(rs_val, sh);
++      uint32_t xo_md = instr->bits(3, 2);
++
++      switch (xo_md) {
++        case 0: {
++          // rldicl: RA = ROTL64(RS, SH) & MASK(mb, 63)
++          uint32_t mb = instr->mdMBValue();
++          uint64_t mask = MASK64(mb, 63);
++          int64_t result = I64(rotated & mask);
++          setRegister(ra_reg, result);
++          if (instr->rcBit()) updateCR0(result);
++          break;
++        }
++        case 1: {
++          // rldicr: RA = ROTL64(RS, SH) & MASK(0, me)
++          uint32_t me = instr->mdMEValue();
++          uint64_t mask = MASK64(0, me);
++          int64_t result = I64(rotated & mask);
++          setRegister(ra_reg, result);
++          if (instr->rcBit()) updateCR0(result);
++          break;
++        }
++        case 2: {
++          // rldic: RA = ROTL64(RS, SH) & MASK(mb, ~SH)
++          // Actually: MASK(mb, 63-SH)
++          uint32_t mb = instr->mdMBValue();
++          uint64_t mask = MASK64(mb, 63 - sh);
++          int64_t result = I64(rotated & mask);
++          setRegister(ra_reg, result);
++          if (instr->rcBit()) updateCR0(result);
++          break;
++        }
++        case 3: {
++          // rldimi: RA = (ROTL64(RS,SH) & MASK) | (RA & ~MASK)
++          uint32_t mb = instr->mdMBValue();
++          uint64_t mask = MASK64(mb, 63 - sh);
++          uint64_t ra_val = U64(getRegister(ra_reg));
++          int64_t result = I64((rotated & mask) | (ra_val & ~mask));
++          setRegister(ra_reg, result);
++          if (instr->rcBit()) updateCR0(result);
++          break;
++        }
++        default:
++          MOZ_CRASH_UNSAFE_PRINTF("decodeRotateMask: MD xo=%u", xo_md);
++      }
++    }
++  } else {
++    MOZ_CRASH_UNSAFE_PRINTF("decodeRotateMask: opcode=%u", opcode);
++  }
++}
++
++// -----------------------------------------------------------------------------
++// CR-bit accessors used by the XL-form CR-logic ops (crand, crandc, cror,
++// crorc, crxor, creqv). Bit index is in BIF*4+x form: field=b/4, bit=b%4
++// where 0=LT, 1=GT, 2=EQ, 3=SO.
++static inline uint8_t CRBitMask(uint32_t bitInField) {
++  switch (bitInField) {
++    case 0: return kCRFieldLT;
++    case 1: return kCRFieldGT;
++    case 2: return kCRFieldEQ;
++    case 3: return kCRFieldSO;
++  }
++  return 0;
++}
++
++static inline bool GetCRBit(Simulator& s, uint32_t b) {
++  return (s.getCRField(b / 4) & CRBitMask(b % 4)) != 0;
++}
++
++static inline void SetCRBit(Simulator& s, uint32_t b, bool val) {
++  uint8_t fv = s.getCRField(b / 4);
++  uint8_t mask = CRBitMask(b % 4);
++  s.setCRField(b / 4, val ? (fv | mask) : (fv & ~mask));
++}
++
++// -----------------------------------------------------------------------------
++// decodeBranch: b(18), bc(16), XL-form(19)
++
++void Simulator::decodeBranch(SimInstruction* instr) {
++  uint32_t opcode = instr->opcode();
++
++  if (opcode == 18) {
++    // b / bl: I-form unconditional branch
++    int32_t offset = instr->li26Value();
++    bool lk = instr->lkBit();
++    bool aa = instr->aaBit();
++
++    int64_t target;
++    if (aa) {
++      target = (int64_t)offset;
++    } else {
++      target = get_pc() + (int64_t)offset;
++    }
++
++    if (lk) {
++      setLR(get_pc() + SimInstruction::kInstrSize);
++    }
++
++    set_pc(target);
++    return;
++  }
++
++  if (opcode == 16) {
++    // bc / bcl: B-form conditional branch
++    uint32_t bo = instr->boValue();
++    uint32_t bi = instr->biValue();
++    int32_t bd = instr->bd16Value();
++    bool lk = instr->lkBit();
++    bool aa = instr->aaBit();
++
++    // Decrement CTR if BO[2] (bit 2 of BO, which is bo & 0x04) is clear.
++    if (!(bo & 0x04)) {
++      setCTR(getCTR() - 1);
++    }
++
++    // Evaluate CTR condition.
++    bool ctr_ok = (bo & 0x04) ||
++                  ((getCTR() != 0) ^ ((bo & 0x02) != 0));
++
++    // Evaluate CR condition.
++    uint32_t crField = bi / 4;
++    uint32_t crBit = bi % 4;
++    uint8_t crFieldVal = getCRField(crField);
++    bool crBitSet;
++    switch (crBit) {
++      case 0: crBitSet = (crFieldVal & kCRFieldLT) != 0; break;
++      case 1: crBitSet = (crFieldVal & kCRFieldGT) != 0; break;
++      case 2: crBitSet = (crFieldVal & kCRFieldEQ) != 0; break;
++      case 3: crBitSet = (crFieldVal & kCRFieldSO) != 0; break;
++      default: crBitSet = false; break;
++    }
++    bool cond_ok = (bo & 0x10) || (crBitSet == ((bo & 0x08) != 0));
++
++    if (ctr_ok && cond_ok) {
++      int64_t target;
++      if (aa) {
++        target = (int64_t)bd;
++      } else {
++        target = get_pc() + (int64_t)bd;
++      }
++      if (lk) {
++        setLR(get_pc() + SimInstruction::kInstrSize);
++      }
++      set_pc(target);
++    } else {
++      // Branch not taken.
++      set_pc(get_pc() + SimInstruction::kInstrSize);
++    }
++    return;
++  }
++
++  if (opcode == 19) {
++    // XL-form: bclr, bcctr, crand, crandc, cror, crorc, crxor, creqv,
++    //          mcrf, isync
++    uint32_t xl = instr->xlValue();
++
++    switch (xl) {
++      case 16: {
++        // bclr: conditional branch to LR
++        uint32_t bo = instr->boValue();
++        uint32_t bi = instr->biValue();
++        bool lk = instr->lkBit();
++
++        if (!(bo & 0x04)) {
++          setCTR(getCTR() - 1);
++        }
++
++        bool ctr_ok = (bo & 0x04) ||
++                      ((getCTR() != 0) ^ ((bo & 0x02) != 0));
++
++        uint32_t crField = bi / 4;
++        uint32_t crBit = bi % 4;
++        uint8_t crFieldVal = getCRField(crField);
++        bool crBitSet;
++        switch (crBit) {
++          case 0: crBitSet = (crFieldVal & kCRFieldLT) != 0; break;
++          case 1: crBitSet = (crFieldVal & kCRFieldGT) != 0; break;
++          case 2: crBitSet = (crFieldVal & kCRFieldEQ) != 0; break;
++          case 3: crBitSet = (crFieldVal & kCRFieldSO) != 0; break;
++          default: crBitSet = false; break;
++        }
++        bool cond_ok = (bo & 0x10) || (crBitSet == ((bo & 0x08) != 0));
++
++        if (ctr_ok && cond_ok) {
++          int64_t target = getLR() & ~3LL;
++          if (lk) {
++            setLR(get_pc() + SimInstruction::kInstrSize);
++          }
++          set_pc(target);
++        } else {
++          set_pc(get_pc() + SimInstruction::kInstrSize);
++        }
++        break;
++      }
++      case 528: {
++        // bcctr: conditional branch to CTR
++        uint32_t bo = instr->boValue();
++        uint32_t bi = instr->biValue();
++        bool lk = instr->lkBit();
++
++        // CTR is not decremented for bcctr.
++        uint32_t crField = bi / 4;
++        uint32_t crBit = bi % 4;
++        uint8_t crFieldVal = getCRField(crField);
++        bool crBitSet;
++        switch (crBit) {
++          case 0: crBitSet = (crFieldVal & kCRFieldLT) != 0; break;
++          case 1: crBitSet = (crFieldVal & kCRFieldGT) != 0; break;
++          case 2: crBitSet = (crFieldVal & kCRFieldEQ) != 0; break;
++          case 3: crBitSet = (crFieldVal & kCRFieldSO) != 0; break;
++          default: crBitSet = false; break;
++        }
++        bool cond_ok = (bo & 0x10) || (crBitSet == ((bo & 0x08) != 0));
++
++        if (cond_ok) {
++          int64_t target = getCTR() & ~3LL;
++          if (lk) {
++            setLR(get_pc() + SimInstruction::kInstrSize);
++          }
++          set_pc(target);
++        } else {
++          set_pc(get_pc() + SimInstruction::kInstrSize);
++        }
++        break;
++      }
++      case 257: {
++        // crand: CR[BT] = CR[BA] & CR[BB]
++        uint32_t bt = instr->rtValue();
++        uint32_t ba = instr->raValue();
++        uint32_t bb = instr->rbValue();
++        SetCRBit(*this, bt, GetCRBit(*this, ba) && GetCRBit(*this, bb));
++        break;
++      }
++      case 129: {
++        // crandc: CR[BT] = CR[BA] & ~CR[BB]
++        uint32_t bt = instr->rtValue();
++        uint32_t ba = instr->raValue();
++        uint32_t bb = instr->rbValue();
++        SetCRBit(*this, bt, GetCRBit(*this, ba) && !GetCRBit(*this, bb));
++        break;
++      }
++      case 449: {
++        // cror: CR[BT] = CR[BA] | CR[BB]
++        uint32_t bt = instr->rtValue();
++        uint32_t ba = instr->raValue();
++        uint32_t bb = instr->rbValue();
++        SetCRBit(*this, bt, GetCRBit(*this, ba) || GetCRBit(*this, bb));
++        break;
++      }
++      case 417: {
++        // crorc: CR[BT] = CR[BA] | ~CR[BB]
++        uint32_t bt = instr->rtValue();
++        uint32_t ba = instr->raValue();
++        uint32_t bb = instr->rbValue();
++        SetCRBit(*this, bt, GetCRBit(*this, ba) || !GetCRBit(*this, bb));
++        break;
++      }
++      case 193: {
++        // crxor: CR[BT] = CR[BA] ^ CR[BB]
++        uint32_t bt = instr->rtValue();
++        uint32_t ba = instr->raValue();
++        uint32_t bb = instr->rbValue();
++        SetCRBit(*this, bt, GetCRBit(*this, ba) ^ GetCRBit(*this, bb));
++        break;
++      }
++      case 289: {
++        // creqv: CR[BT] = ~(CR[BA] ^ CR[BB])
++        uint32_t bt = instr->rtValue();
++        uint32_t ba = instr->raValue();
++        uint32_t bb = instr->rbValue();
++        SetCRBit(*this, bt, !(GetCRBit(*this, ba) ^ GetCRBit(*this, bb)));
++        break;
++      }
++      case 150: {
++        // isync: no-op in simulator
++        break;
++      }
++      case 370: {
++        // PPC_stop (0x4C0002E4) decoded as XL-form opcode 19, XL=370.
++        // This is our kCallRedirInstr. Handle via softwareInterrupt.
++        softwareInterrupt(instr);
++        break;
++      }
++      case 2: {
++        // POWER9 addpcis rT, D (DX-form). Computes rT = (CIA + 4) +
++        // (sext16(D) << 16). The 16-bit signed displacement D is split
++        // across three sub-fields:
++        //   d0 = bits LE 6..15 (10 bits) — D[15:6]
++        //   d1 = bits LE 16..20 (5 bits)  — D[5:1]
++        //   d2 = bit  LE 0      (1 bit)   — D[0]
++        // (Mirrors the encoder in Assembler-ppc64.cpp:as_addpcis.)
++        uint32_t rt = instr->rtValue();
++        uint32_t d0 = instr->bits(15, 6);
++        uint32_t d1 = instr->bits(20, 16);
++        uint32_t d2 = instr->bit(0);
++        int16_t D = (int16_t)((d0 << 6) | (d1 << 1) | d2);
++        int64_t cia = reinterpret_cast<int64_t>(instr);
++        setRegister(rt, cia + SimInstruction::kInstrSize +
++                            (static_cast<int64_t>(D) << 16));
++        break;
++      }
++      default:
++        MOZ_CRASH_UNSAFE_PRINTF("decodeBranch: XL opcode 19, xl=%u", xl);
++    }
++    return;
++  }
++
++  MOZ_CRASH_UNSAFE_PRINTF("decodeBranch: opcode=%u", opcode);
++}
++
++// -----------------------------------------------------------------------------
++// decodeFP: Major opcodes 59 (A-form single) and 63 (X-form / A-form double)
++
++void Simulator::decodeFP(SimInstruction* instr) {
++  uint32_t opcode = instr->opcode();
++  uint32_t rt = instr->rtValue();  // FRT
++  uint32_t ra = instr->raValue();  // FRA
++  uint32_t rb = instr->rbValue();  // FRB
++  uint32_t rc_reg = instr->rcValue();  // FRC (A-form)
++
++  if (opcode == 63) {
++    // X-form and A-form double-precision instructions.
++    // For A-form, the sub-opcode is in bits 1..5.
++    // For X-form, the sub-opcode is in bits 1..10.
++    uint32_t xo_a = instr->bits(5, 1);  // A-form sub-opcode
++    uint32_t xo_x = instr->bits(10, 1); // X-form sub-opcode
++
++    // Try A-form first (5-bit sub-opcode in bits 1..5).
++    switch (xo_a) {
++      case 21: {
++        // fadd
++        double result = getFpuRegisterDouble(ra) + getFpuRegisterDouble(rb);
++        setFpuRegisterDouble(rt, result);
++        return;
++      }
++      case 20: {
++        // fsub
++        double result = getFpuRegisterDouble(ra) - getFpuRegisterDouble(rb);
++        setFpuRegisterDouble(rt, result);
++        return;
++      }
++      case 25: {
++        // fmul: FRT = FRA * FRC (note: FRC, not FRB!)
++        double result = getFpuRegisterDouble(ra) * getFpuRegisterDouble(rc_reg);
++        setFpuRegisterDouble(rt, result);
++        return;
++      }
++      case 18: {
++        // fdiv
++        double result = getFpuRegisterDouble(ra) / getFpuRegisterDouble(rb);
++        setFpuRegisterDouble(rt, result);
++        return;
++      }
++      case 22: {
++        // fsqrt
++        double result = sqrt(getFpuRegisterDouble(rb));
++        setFpuRegisterDouble(rt, result);
++        return;
++      }
++      case 29: {
++        // fmadd: FRT = FRA * FRC + FRB
++        double result = std::fma(getFpuRegisterDouble(ra),
++                                 getFpuRegisterDouble(rc_reg),
++                                 getFpuRegisterDouble(rb));
++        setFpuRegisterDouble(rt, result);
++        return;
++      }
++      case 30: {
++        // fnmsub: FRT = -(FRA * FRC - FRB)
++        double result = -(std::fma(getFpuRegisterDouble(ra),
++                                   getFpuRegisterDouble(rc_reg),
++                                   -getFpuRegisterDouble(rb)));
++        setFpuRegisterDouble(rt, result);
++        return;
++      }
++      case 28: {
++        // fmsub: FRT = FRA * FRC - FRB
++        double result = std::fma(getFpuRegisterDouble(ra),
++                                 getFpuRegisterDouble(rc_reg),
++                                 -getFpuRegisterDouble(rb));
++        setFpuRegisterDouble(rt, result);
++        return;
++      }
++      case 31: {
++        // fnmadd: FRT = -(FRA * FRC + FRB)
++        double result = -(std::fma(getFpuRegisterDouble(ra),
++                                   getFpuRegisterDouble(rc_reg),
++                                   getFpuRegisterDouble(rb)));
++        setFpuRegisterDouble(rt, result);
++        return;
++      }
++      case 23: {
++        // fsel: FRT = (FRA >= 0) ? FRC : FRB
++        double fra = getFpuRegisterDouble(ra);
++        setFpuRegisterDouble(rt, (fra >= 0.0) ? getFpuRegisterDouble(rc_reg)
++                                               : getFpuRegisterDouble(rb));
++        return;
++      }
++      case 26: {
++        // frsqrte: FRT = 1.0 / sqrt(FRB) (estimate)
++        double result = 1.0 / sqrt(getFpuRegisterDouble(rb));
++        setFpuRegisterDouble(rt, result);
++        return;
++      }
++    }
++
++    // X-form (10-bit sub-opcode).
++    switch (xo_x) {
++      case 72: {
++        // fmr: FRT = FRB
++        setFpuRegisterDouble(rt, getFpuRegisterDouble(rb));
++        break;
++      }
++      case 40: {
++        // fneg: FRT = -FRB
++        setFpuRegisterDouble(rt, -getFpuRegisterDouble(rb));
++        break;
++      }
++      case 264: {
++        // fabs: FRT = |FRB|
++        setFpuRegisterDouble(rt, fabs(getFpuRegisterDouble(rb)));
++        break;
++      }
++      case 136: {
++        // fnabs: FRT = -|FRB|
++        setFpuRegisterDouble(rt, -fabs(getFpuRegisterDouble(rb)));
++        break;
++      }
++      case 8: {
++        // fcpsgn: FRT = sign(FRA) || magnitude(FRB)
++        double fra = getFpuRegisterDouble(ra);
++        double frb = getFpuRegisterDouble(rb);
++        setFpuRegisterDouble(rt, std::copysign(frb, fra));
++        break;
++      }
++      case 0: {
++        // fcmpu: compare FRA, FRB unordered
++        uint32_t bf = instr->bfValue();
++        double fra = getFpuRegisterDouble(ra);
++        double frb = getFpuRegisterDouble(rb);
++        uint8_t field = 0;
++        if (std::isnan(fra) || std::isnan(frb)) {
++          field = kCRFieldSO;
++        } else if (fra < frb) {
++          field = kCRFieldLT;
++        } else if (fra > frb) {
++          field = kCRFieldGT;
++        } else {
++          field = kCRFieldEQ;
++        }
++        setCRField(bf, field);
++        break;
++      }
++      case 32: {
++        // fcmpo: compare FRA, FRB ordered
++        uint32_t bf = instr->bfValue();
++        double fra = getFpuRegisterDouble(ra);
++        double frb = getFpuRegisterDouble(rb);
++        uint8_t field = 0;
++        if (std::isnan(fra) || std::isnan(frb)) {
++          field = kCRFieldSO;
++        } else if (fra < frb) {
++          field = kCRFieldLT;
++        } else if (fra > frb) {
++          field = kCRFieldGT;
++        } else {
++          field = kCRFieldEQ;
++        }
++        setCRField(bf, field);
++        break;
++      }
++      // For fctid* and fctiw* the ISA specifies that bit 23 of FPSCR (VXCVI,
++      // "invalid op for integer convert") is set when the source is NaN, +Inf,
++      // -Inf, or out of the destination's range. Wasm's out-of-range trap
++      // sequence is `mtfsb0 23; fctidz; mfvsrd; mcrfs cr0,5; bt SOBit,trap`,
++      // so the simulator MUST update VXCVI here for the trap to fire. With
++      // FPSCR_ in the low-half PPC layout (PPC bit N → int64 bit (31-N)),
++      // VXCVI lives at int64 bit (31-23) = 8.
++      case 814: {
++        // fctid: convert double to int64 (current rounding)
++        double frb = getFpuRegisterDouble(rb);
++        int64_t result;
++        bool invalid = false;
++        if (std::isnan(frb)) {
++          result = INT64_MIN;
++          invalid = true;
++        } else if (frb >= -(double)INT64_MIN || frb < (double)INT64_MIN) {
++          result = (frb < 0) ? INT64_MIN : INT64_MAX;
++          invalid = true;
++        } else {
++          switch (FPSCR_ & kFPSCRRNMask) {
++            case RN: result = (int64_t)llrint(frb); break;
++            case RZ: result = (int64_t)frb; break;
++            case RP: result = (int64_t)ceil(frb); break;
++            case RM: result = (int64_t)floor(frb); break;
++            default: result = (int64_t)frb; break;
++          }
++        }
++        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
++        setFpuRegister(rt, result);
++        break;
++      }
++      case 815: {
++        // fctidz: convert double to int64 (round toward zero)
++        double frb = getFpuRegisterDouble(rb);
++        int64_t result;
++        bool invalid = false;
++        if (std::isnan(frb)) {
++          result = INT64_MIN;
++          invalid = true;
++        } else if (frb >= -(double)INT64_MIN) {
++          result = INT64_MAX;
++          invalid = true;
++        } else if (frb < (double)INT64_MIN) {
++          result = INT64_MIN;
++          invalid = true;
++        } else {
++          result = (int64_t)frb;
++        }
++        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
++        setFpuRegister(rt, result);
++        break;
++      }
++      case 942: {
++        // fctidu: convert double to uint64 (current rounding).
++        // VXCVI is signaled when source is NaN, ±Inf, or the rounded value
++        // is outside [0, 2^64-1]. Notably,
++        // a negative source whose rounded value is 0 (e.g. -0.4 in RN, or
++        // any value in (-1, 0) in RZ) is NOT invalid.
++        double frb = getFpuRegisterDouble(rb);
++        uint64_t result;
++        bool invalid = false;
++        if (std::isnan(frb)) {
++          result = 0;
++          invalid = true;
++        } else if (frb >= -2.0 * (double)INT64_MIN /* 2^64 */) {
++          result = UINT64_MAX;
++          invalid = true;
++        } else {
++          double rounded;
++          switch (FPSCR_ & kFPSCRRNMask) {
++            case RN: rounded = nearbyint(frb); break;
++            case RZ: rounded = trunc(frb); break;
++            case RP: rounded = ceil(frb); break;
++            case RM: rounded = floor(frb); break;
++            default: rounded = trunc(frb); break;
++          }
++          if (rounded < 0.0) {
++            result = 0;
++            invalid = true;
++          } else {
++            result = (uint64_t)rounded;
++          }
++        }
++        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
++        setFpuRegister(rt, I64(result));
++        break;
++      }
++      case 943: {
++        // fctiduz: convert double to uint64 (round toward zero).
++        // Same VXCVI rule as fctidu but rounding is fixed to truncate
++        // toward zero. Source in (-1, 0) truncates to 0 — VALID.
++        double frb = getFpuRegisterDouble(rb);
++        uint64_t result;
++        bool invalid = false;
++        if (std::isnan(frb)) {
++          result = 0;
++          invalid = true;
++        } else if (frb >= -2.0 * (double)INT64_MIN /* 2^64 */) {
++          result = UINT64_MAX;
++          invalid = true;
++        } else if (frb <= -1.0) {
++          // Truncated value is negative — invalid for unsigned.
++          result = 0;
++          invalid = true;
++        } else {
++          // Source is in (-1, 2^64); truncation toward zero yields a value
++          // in [0, 2^64).
++          result = (uint64_t)trunc(frb);
++        }
++        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
++        setFpuRegister(rt, I64(result));
++        break;
++      }
++      case 14: {
++        // fctiw: convert double to int32 (current rounding).
++        // Invalid range: rounded value < INT32_MIN or > INT32_MAX. The
++        // double-precision boundary on the negative side is INT32_MIN-1 =
++        // -2^31-1 = -2147483649.0 (exactly representable; doubles in
++        // (-2^31-1, -2^31) all round-to-nearest to -2^31 which is valid).
++        double frb = getFpuRegisterDouble(rb);
++        int32_t result;
++        bool invalid = false;
++        if (std::isnan(frb)) {
++          result = INT32_MIN;
++          invalid = true;
++        } else {
++          double rounded;
++          switch (FPSCR_ & kFPSCRRNMask) {
++            case RN: rounded = nearbyint(frb); break;
++            case RZ: rounded = trunc(frb); break;
++            case RP: rounded = ceil(frb); break;
++            case RM: rounded = floor(frb); break;
++            default: rounded = trunc(frb); break;
++          }
++          if (rounded > (double)INT32_MAX) {
++            result = INT32_MAX;
++            invalid = true;
++          } else if (rounded < (double)INT32_MIN) {
++            result = INT32_MIN;
++            invalid = true;
++          } else {
++            result = (int32_t)rounded;
++          }
++        }
++        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
++        setFpuRegister(rt, (int64_t)result);
++        break;
++      }
++      case 15: {
++        // fctiwz: convert double to int32 (round toward zero).
++        // Truncation of a value in (-2^31-1, INT32_MIN) toward zero gives
++        // INT32_MIN — valid. Only `frb <= -2^31-1` (i.e. `frb < INT32_MIN-1+1`
++        // = `frb < -2147483648` ... wait, simplest: check truncated value in
++        // range AFTER truncation.)
++        double frb = getFpuRegisterDouble(rb);
++        int32_t result;
++        bool invalid = false;
++        if (std::isnan(frb)) {
++          result = INT32_MIN;
++          invalid = true;
++        } else {
++          double truncated = trunc(frb);
++          if (truncated > (double)INT32_MAX) {
++            result = INT32_MAX;
++            invalid = true;
++          } else if (truncated < (double)INT32_MIN) {
++            result = INT32_MIN;
++            invalid = true;
++          } else {
++            result = (int32_t)truncated;
++          }
++        }
++        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
++        setFpuRegister(rt, (int64_t)result);
++        break;
++      }
++      case 142: {
++        // fctiwu: convert double to uint32 (current rounding). The check is
++        // on the ROUNDED value: VXCVI iff rounded < 0 or rounded > UINT32_MAX.
++        double frb = getFpuRegisterDouble(rb);
++        uint32_t result;
++        bool invalid = false;
++        if (std::isnan(frb)) {
++          result = 0;
++          invalid = true;
++        } else {
++          double rounded;
++          switch (FPSCR_ & kFPSCRRNMask) {
++            case RN: rounded = nearbyint(frb); break;
++            case RZ: rounded = trunc(frb); break;
++            case RP: rounded = ceil(frb); break;
++            case RM: rounded = floor(frb); break;
++            default: rounded = trunc(frb); break;
++          }
++          if (rounded < 0.0) {
++            result = 0;
++            invalid = true;
++          } else if (rounded > (double)UINT32_MAX) {
++            result = UINT32_MAX;
++            invalid = true;
++          } else {
++            result = (uint32_t)rounded;
++          }
++        }
++        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
++        setFpuRegister(rt, (int64_t)(uint64_t)result);
++        break;
++      }
++      case 143: {
++        // fctiwuz: convert double to uint32 (round toward zero).
++        // Source in (-1, 0) truncates to 0 — VALID.
++        double frb = getFpuRegisterDouble(rb);
++        uint32_t result;
++        bool invalid = false;
++        if (std::isnan(frb)) {
++          result = 0;
++          invalid = true;
++        } else {
++          double truncated = trunc(frb);
++          if (truncated > (double)UINT32_MAX) {
++            result = UINT32_MAX;
++            invalid = true;
++          } else if (truncated < 0.0) {
++            result = 0;
++            invalid = true;
++          } else {
++            result = (uint32_t)truncated;
++          }
++        }
++        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
++        setFpuRegister(rt, (int64_t)(uint64_t)result);
++        break;
++      }
++      case 846: {
++        // fcfid: convert int64 in FPR to double
++        int64_t val = getFpuRegister(rb);
++        setFpuRegisterDouble(rt, (double)val);
++        break;
++      }
++      case 974: {
++        // fcfidu: convert uint64 in FPR to double
++        uint64_t val = U64(getFpuRegister(rb));
++        setFpuRegisterDouble(rt, (double)val);
++        break;
++      }
++      case 12: {
++        // frsp: round double to single precision (then re-extend in FPR).
++        // sNaN inputs are quieted (the result payload MSB is set).
++        // wasm f32.demote_f64 lowers to this op when
++        // not using xscvdpsp directly.
++        double frb = getFpuRegisterDouble(rb);
++        float result = demoteDoublePreservingNaN(frb);
++        uint32_t fbits;
++        memcpy(&fbits, &result, sizeof(fbits));
++        if ((fbits & 0x7F800000u) == 0x7F800000u &&
++            (fbits & 0x007FFFFFu) != 0) {
++          fbits |= 0x00400000u;
++          memcpy(&result, &fbits, sizeof(result));
++        }
++        setFpuRegisterDouble(rt, promoteFloatPreservingNaN(result));
++        break;
++      }
++      case 392: {
++        // frin: round to nearest integer (ties away from zero)
++        double frb = getFpuRegisterDouble(rb);
++        setFpuRegisterDouble(rt, round(frb));
++        break;
++      }
++      case 424: {
++        // friz: round toward zero
++        double frb = getFpuRegisterDouble(rb);
++        setFpuRegisterDouble(rt, trunc(frb));
++        break;
++      }
++      case 456: {
++        // frip: round toward +infinity (ceil). XO=456.
++        double frb = getFpuRegisterDouble(rb);
++        setFpuRegisterDouble(rt, ceil(frb));
++        break;
++      }
++      case 488: {
++        // frim: round toward -infinity (floor). XO=488.
++        double frb = getFpuRegisterDouble(rb);
++        setFpuRegisterDouble(rt, floor(frb));
++        break;
++      }
++      case 583: {
++        // mffs: FRT = FPSCR (as double bit pattern)
++        setFpuRegister(rt, I64(FPSCR_));
++        break;
++      }
++      // FPSCR is treated as a 32-bit register stored in the low 32 bits of
++      // FPSCR_ (uint64_t), with PPC bit numbering: PPC bit N (where bit 0 is
++      // the MSB) lives at int64 bit (31-N). Field F (4 bits) covers PPC bits
++      // 4F..4F+3 → int64 bit-LSB (28-4F) to bit-MSB (31-4F). This matches
++      // mcrfs, mtfsfi, kFPSCRRNMask (which checks bits 30-31 PPC = int64 bits
++      // 0-1), and mffs (which copies FPSCR into FPR bits 32..63 PPC = int64
++      // bits 0..31). Earlier mtfsb0/mtfsb1 used (63-bt) which placed bits in
++      // the high half of FPSCR_ where mcrfs etc. would never see them — so
++      // the wasm trap sequence `mtfsb0 23; fctidz; mcrfs cr0,5; bt SO,oolEntry`
++      // could not detect VXCVI.
++      case 70: {
++        // mtfsb0: clear FPSCR bit. XO=70.
++        // (Cases 38 and 70 had the labels swapped, so wasm's
++        // `mtfsb0 23; fctidz; mcrfs cr0,5; bt SO,trap` sequence accidentally
++        // SET VXCVI before the convert ran, causing every fctid* to trap.)
++        uint32_t bt = instr->rtValue();
++        FPSCR_ &= ~(1ULL << (31 - bt));
++        break;
++      }
++      case 64: {
++        // mcrfs: copy FPSCR field to CR field
++        uint32_t bf = instr->bfValue();
++        uint32_t bfa = instr->bits(20, 18);
++        uint32_t shift = 4 * (7 - bfa);
++        uint8_t val = (FPSCR_ >> shift) & 0xF;
++        setCRField(bf, val);
++        break;
++      }
++      default:
++        MOZ_CRASH_UNSAFE_PRINTF(
++            "decodeFP: opcode 63, xo_x=%u (instruction 0x%08x)", xo_x,
++            instr->instructionBits());
++    }
++  } else if (opcode == 59) {
++    // A-form single-precision instructions.
++    uint32_t xo_a = instr->bits(5, 1);
++
++    switch (xo_a) {
++      case 21: {
++        // fadds
++        double result = (double)((float)(getFpuRegisterDouble(ra) +
++                                         getFpuRegisterDouble(rb)));
++        setFpuRegisterDouble(rt, result);
++        break;
++      }
++      case 20: {
++        // fsubs
++        double result = (double)((float)(getFpuRegisterDouble(ra) -
++                                         getFpuRegisterDouble(rb)));
++        setFpuRegisterDouble(rt, result);
++        break;
++      }
++      case 25: {
++        // fmuls: FRT = (float)(FRA * FRC)
++        double result = (double)((float)(getFpuRegisterDouble(ra) *
++                                         getFpuRegisterDouble(rc_reg)));
++        setFpuRegisterDouble(rt, result);
++        break;
++      }
++      case 18: {
++        // fdivs
++        double result = (double)((float)(getFpuRegisterDouble(ra) /
++                                         getFpuRegisterDouble(rb)));
++        setFpuRegisterDouble(rt, result);
++        break;
++      }
++      case 22: {
++        // fsqrts
++        double result = (double)sqrtf((float)getFpuRegisterDouble(rb));
++        setFpuRegisterDouble(rt, result);
++        break;
++      }
++      case 29: {
++        // fmadds
++        double result = (double)((float)std::fma(getFpuRegisterDouble(ra),
++                                                 getFpuRegisterDouble(rc_reg),
++                                                 getFpuRegisterDouble(rb)));
++        setFpuRegisterDouble(rt, result);
++        break;
++      }
++      case 30: {
++        // fnmsubs
++        double result = (double)(-(float)std::fma(getFpuRegisterDouble(ra),
++                                                  getFpuRegisterDouble(rc_reg),
++                                                  -getFpuRegisterDouble(rb)));
++        setFpuRegisterDouble(rt, result);
++        break;
++      }
++      case 28: {
++        // fmsubs
++        double result = (double)((float)std::fma(getFpuRegisterDouble(ra),
++                                                 getFpuRegisterDouble(rc_reg),
++                                                 -getFpuRegisterDouble(rb)));
++        setFpuRegisterDouble(rt, result);
++        break;
++      }
++      case 31: {
++        // fnmadds
++        double result = (double)(-(float)std::fma(getFpuRegisterDouble(ra),
++                                                  getFpuRegisterDouble(rc_reg),
++                                                  getFpuRegisterDouble(rb)));
++        setFpuRegisterDouble(rt, result);
++        break;
++      }
++      default: {
++        // Try X-form sub-opcodes for opcode 59 (e.g., fcfids, fcfidus).
++        uint32_t xo_x = instr->bits(10, 1);
++        switch (xo_x) {
++          case 846: {
++            // fcfids: convert int64 to float single (result stored as double)
++            int64_t val = getFpuRegister(rb);
++            setFpuRegisterDouble(rt, (double)(float)val);
++            break;
++          }
++          case 974: {
++            // fcfidus: convert uint64 to float single
++            uint64_t val = U64(getFpuRegister(rb));
++            setFpuRegisterDouble(rt, (double)(float)val);
++            break;
++          }
++          default:
++            MOZ_CRASH_UNSAFE_PRINTF(
++                "decodeFP: opcode 59, xo_a=%u xo_x=%u", xo_a, xo_x);
++        }
++        break;
++      }
++    }
++  } else {
++    MOZ_CRASH_UNSAFE_PRINTF("decodeFP: opcode=%u", opcode);
++  }
++}
++
++// -----------------------------------------------------------------------------
++// decodeVMX: Major opcode 4 (AltiVec/VMX vector ops on VR0-VR31).
++//
++// VR-form (VX-form): bits 0-5 = primary opcode (4), bits 6-10 = VRT,
++// bits 11-15 = VRA, bits 16-20 = VRB, bits 21-31 = XO (11 bits).
++// XO extracted via `instructionBits() & 0x7FF`.
++//
++// Helpers below pack/unpack each VR via the VRregisters_ byte storage
++// (16 bytes, big-endian PPC numbering: bytes[0] is the most-significant
++// byte of the architectural register, but on PPC64 LE wasm the lane
++// ordering is what the JIT expects). All ops here use byte-level
++// accessors for consistency with the existing VMX memory ops.
++
++void Simulator::decodeVMX(SimInstruction* instr) {
++  uint32_t xo = instr->instructionBits() & 0x7FFu;
++  uint32_t vrt = instr->rtValue();   // bits 6..10
++  uint32_t vra = instr->raValue();   // bits 11..15
++  uint32_t vrb = instr->rbValue();   // bits 16..20
++  uint32_t uimm = instr->raValue();  // VA-form: 5-bit immediate at bits 11..15
++
++  uint8_t a[16], b[16], r[16];
++  getVRBytes(vra, a);
++  getVRBytes(vrb, b);
++
++  // Helpers for treating the byte storage as typed lane arrays.
++  // The PPC64LE wasm SIMD lowering stores each lane's bytes in
++  // little-endian order, so lane i of an N-byte element occupies bytes
++  // (i*N) .. (i*N + N - 1) with the LSB at byte (i*N). For example,
++  // a v128.const i32x4 0x12345678 has bytes [78 56 34 12 …].
++  #define LANE_U8(buf, i)  ((uint8_t)(buf)[(i)])
++  #define LANE_S8(buf, i)  ((int8_t)(buf)[(i)])
++  #define LANE_U16(buf, i)                                     \
++    ((uint16_t)((uint16_t)(buf)[(i) * 2] |                    \
++                ((uint16_t)(buf)[(i) * 2 + 1] << 8)))
++  #define LANE_S16(buf, i) ((int16_t)LANE_U16(buf, i))
++  #define LANE_U32(buf, i)                                     \
++    ((uint32_t)((uint32_t)(buf)[(i) * 4] |                    \
++                ((uint32_t)(buf)[(i) * 4 + 1] << 8) |         \
++                ((uint32_t)(buf)[(i) * 4 + 2] << 16) |        \
++                ((uint32_t)(buf)[(i) * 4 + 3] << 24)))
++  #define LANE_S32(buf, i) ((int32_t)LANE_U32(buf, i))
++  #define LANE_U64(buf, i)                                     \
++    ((uint64_t)((uint64_t)(buf)[(i) * 8] |                    \
++                ((uint64_t)(buf)[(i) * 8 + 1] << 8) |         \
++                ((uint64_t)(buf)[(i) * 8 + 2] << 16) |        \
++                ((uint64_t)(buf)[(i) * 8 + 3] << 24) |        \
++                ((uint64_t)(buf)[(i) * 8 + 4] << 32) |        \
++                ((uint64_t)(buf)[(i) * 8 + 5] << 40) |        \
++                ((uint64_t)(buf)[(i) * 8 + 6] << 48) |        \
++                ((uint64_t)(buf)[(i) * 8 + 7] << 56)))
++  #define LANE_S64(buf, i) ((int64_t)LANE_U64(buf, i))
++  #define SET_LANE_U8(buf, i, v)  do { (buf)[(i)] = (uint8_t)(v); } while (0)
++  #define SET_LANE_U16(buf, i, v) do {                                       \
++      (buf)[(i) * 2]     = (uint8_t)((uint16_t)(v) & 0xFF);                 \
++      (buf)[(i) * 2 + 1] = (uint8_t)(((uint16_t)(v) >> 8) & 0xFF);          \
++    } while (0)
++  #define SET_LANE_U32(buf, i, v) do {                                       \
++      (buf)[(i) * 4]     = (uint8_t)((uint32_t)(v) & 0xFF);                 \
++      (buf)[(i) * 4 + 1] = (uint8_t)(((uint32_t)(v) >> 8) & 0xFF);          \
++      (buf)[(i) * 4 + 2] = (uint8_t)(((uint32_t)(v) >> 16) & 0xFF);         \
++      (buf)[(i) * 4 + 3] = (uint8_t)(((uint32_t)(v) >> 24) & 0xFF);         \
++    } while (0)
++  #define SET_LANE_U64(buf, i, v) do {                                       \
++      (buf)[(i) * 8]     = (uint8_t)((uint64_t)(v) & 0xFF);                 \
++      (buf)[(i) * 8 + 1] = (uint8_t)(((uint64_t)(v) >> 8) & 0xFF);          \
++      (buf)[(i) * 8 + 2] = (uint8_t)(((uint64_t)(v) >> 16) & 0xFF);         \
++      (buf)[(i) * 8 + 3] = (uint8_t)(((uint64_t)(v) >> 24) & 0xFF);         \
++      (buf)[(i) * 8 + 4] = (uint8_t)(((uint64_t)(v) >> 32) & 0xFF);         \
++      (buf)[(i) * 8 + 5] = (uint8_t)(((uint64_t)(v) >> 40) & 0xFF);         \
++      (buf)[(i) * 8 + 6] = (uint8_t)(((uint64_t)(v) >> 48) & 0xFF);         \
++      (buf)[(i) * 8 + 7] = (uint8_t)(((uint64_t)(v) >> 56) & 0xFF);         \
++    } while (0)
++
++  // --- VA-form pre-dispatch ---
++  //
++  // VA-form has a 6-bit XO at bits 26-31 and a 5-bit VRC at bits 21-25.
++  // decodeVMX's 11-bit XO mask conflates VRC with
++  // XO, so a plain `switch (xo)` over 11-bit values only matches when
++  // VRC == 0. Peel off the three VA-form ops actually used by the JIT
++  // (vmladduhm, vsel, vperm) before the main switch so any VRC value
++  // works. vsldoi (XO=44) is VX-form with SH at bits 22-25, not VA —
++  // handled in the switch below.
++  {
++    uint32_t va_xo = xo & 0x3Fu;
++    if (va_xo == 32 || va_xo == 33 || va_xo == 34 || va_xo == 38 ||
++        va_xo == 40 || va_xo == 42 || va_xo == 43) {
++      uint32_t vrc = (instr->instructionBits() >> 6) & 0x1F;
++      uint8_t cv[16];
++      getVRBytes(vrc, cv);
++      if (va_xo == 32) {
++        // vmhaddshs VT,VA,VB,VC : VT[i] = sat_s16(
++        //   (s32)VA.h[i] * (s32)VB.h[i] >> 15 + (s32)VC.h[i])
++        // (no rounding term — use vmhraddshs for the rounded form).
++        for (int i = 0; i < 8; i++) {
++          int32_t prod = (int32_t)LANE_S16(a, i) * (int32_t)LANE_S16(b, i);
++          int32_t sum = (prod >> 15) + (int32_t)LANE_S16(cv, i);
++          if (sum > INT16_MAX) sum = INT16_MAX;
++          if (sum < INT16_MIN) sum = INT16_MIN;
++          SET_LANE_U16(r, i, (uint16_t)(int16_t)sum);
++        }
++      } else if (va_xo == 33) {
++        // vmhraddshs VT,VA,VB,VC : rounded Q15 multiply-add-saturate.
++        //   VT[i] = sat_s16(((s32)VA.h[i] * (s32)VB.h[i] + 0x4000)
++        //                   >> 15 + (s32)VC.h[i])
++        // Used by wasm i16x8.q15mulr_sat_s (VC is zero).
++        for (int i = 0; i < 8; i++) {
++          int32_t prod = (int32_t)LANE_S16(a, i) * (int32_t)LANE_S16(b, i);
++          int32_t sum = ((prod + 0x4000) >> 15) + (int32_t)LANE_S16(cv, i);
++          if (sum > INT16_MAX) sum = INT16_MAX;
++          if (sum < INT16_MIN) sum = INT16_MIN;
++          SET_LANE_U16(r, i, (uint16_t)(int16_t)sum);
++        }
++      } else if (va_xo == 34) {
++        // vmladduhm VT,VA,VB,VC : VT = low16(VA*VB + VC)
++        for (int i = 0; i < 8; i++) {
++          uint16_t prod = LANE_U16(a, i) * LANE_U16(b, i);
++          SET_LANE_U16(r, i, prod + LANE_U16(cv, i));
++        }
++      } else if (va_xo == 40) {
++        // vmsumshm VT,VA,VB,VC : pairwise multiply-sum of signed halfwords
++        // into i32 lanes, modulo i32 wrap.
++        //   VT.i32[k] = VC.i32[k] + VA.i16[2k]*VB.i16[2k]
++        //                         + VA.i16[2k+1]*VB.i16[2k+1]
++        // Used by wasm i32x4.dot_i16x8_s with VC = 0, and by
++        // i32x4.extadd_pairwise_i16x8_s with VB = splat(1) and VC = 0.
++        for (int k = 0; k < 4; k++) {
++          int32_t a0 = (int32_t)LANE_S16(a, 2 * k);
++          int32_t a1 = (int32_t)LANE_S16(a, 2 * k + 1);
++          int32_t b0 = (int32_t)LANE_S16(b, 2 * k);
++          int32_t b1 = (int32_t)LANE_S16(b, 2 * k + 1);
++          int32_t c  = LANE_S32(cv, k);
++          int32_t result = (int32_t)((uint32_t)c + (uint32_t)(a0 * b0) +
++                                     (uint32_t)(a1 * b1));
++          SET_LANE_U32(r, k, (uint32_t)result);
++        }
++      } else if (va_xo == 38) {
++        // vmsumuhm VT,VA,VB,VC : same as vmsumshm but unsigned halfwords.
++        //   VT.u32[k] = VC.u32[k] + VA.u16[2k]*VB.u16[2k]
++        //                         + VA.u16[2k+1]*VB.u16[2k+1]
++        // Used by wasm i32x4.extadd_pairwise_i16x8_u with VB = splat(1)
++        // and VC = 0.
++        for (int k = 0; k < 4; k++) {
++          uint32_t a0 = (uint32_t)LANE_U16(a, 2 * k);
++          uint32_t a1 = (uint32_t)LANE_U16(a, 2 * k + 1);
++          uint32_t b0 = (uint32_t)LANE_U16(b, 2 * k);
++          uint32_t b1 = (uint32_t)LANE_U16(b, 2 * k + 1);
++          uint32_t c  = LANE_U32(cv, k);
++          uint32_t result = c + a0 * b0 + a1 * b1;
++          SET_LANE_U32(r, k, result);
++        }
++      } else if (va_xo == 42) {
++        // vsel VT,VA,VB,VC : VT[i] = (VC[i] & VB[i]) | (~VC[i] & VA[i])
++        for (int i = 0; i < 16; i++) {
++          r[i] = (uint8_t)((cv[i] & b[i]) | (~cv[i] & a[i]));
++        }
++      } else {
++        // vperm VT,VA,VB,VC; empirical LE:
++        //   r[LE_i] = (VC[LE_i] < 16) ? VA[LE_(15-VC[i])]
++        //                             : VB[LE_(31-VC[i])]
++        for (int i = 0; i < 16; i++) {
++          uint8_t idx = cv[i] & 0x1F;
++          r[i] = (idx < 16) ? a[15 - idx] : b[31 - idx];
++        }
++      }
++      setVRBytes(vrt, r);
++      goto vmx_done;
++    }
++  }
++
++  switch (xo) {
++    // === Integer add (modulo) ===
++    case 0:    // vaddubm
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i, LANE_U8(a, i) + LANE_U8(b, i));
++      }
++      setVRBytes(vrt, r); break;
++    case 64:   // vadduhm
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, LANE_U16(a, i) + LANE_U16(b, i));
++      }
++      setVRBytes(vrt, r); break;
++    case 128:  // vadduwm
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i, LANE_U32(a, i) + LANE_U32(b, i));
++      }
++      setVRBytes(vrt, r); break;
++    case 192:  // vaddudm
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i, LANE_U64(a, i) + LANE_U64(b, i));
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Integer sub (modulo) ===
++    case 1024: // vsububm
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i, LANE_U8(a, i) - LANE_U8(b, i));
++      }
++      setVRBytes(vrt, r); break;
++    case 1088: // vsubuhm
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, LANE_U16(a, i) - LANE_U16(b, i));
++      }
++      setVRBytes(vrt, r); break;
++    case 1152: // vsubuwm
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i, LANE_U32(a, i) - LANE_U32(b, i));
++      }
++      setVRBytes(vrt, r); break;
++    case 1216: // vsubudm
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i, LANE_U64(a, i) - LANE_U64(b, i));
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Integer add (saturating, signed) ===
++    case 768:  // vaddsbs
++      for (int i = 0; i < 16; i++) {
++        int s = (int)LANE_S8(a, i) + (int)LANE_S8(b, i);
++        if (s > INT8_MAX) s = INT8_MAX;
++        if (s < INT8_MIN) s = INT8_MIN;
++        SET_LANE_U8(r, i, (uint8_t)s);
++      }
++      setVRBytes(vrt, r); break;
++    case 832:  // vaddshs
++      for (int i = 0; i < 8; i++) {
++        int s = (int)LANE_S16(a, i) + (int)LANE_S16(b, i);
++        if (s > INT16_MAX) s = INT16_MAX;
++        if (s < INT16_MIN) s = INT16_MIN;
++        SET_LANE_U16(r, i, (uint16_t)s);
++      }
++      setVRBytes(vrt, r); break;
++    case 896:  // vaddsws
++      for (int i = 0; i < 4; i++) {
++        int64_t s = (int64_t)LANE_S32(a, i) + (int64_t)LANE_S32(b, i);
++        if (s > INT32_MAX) s = INT32_MAX;
++        if (s < INT32_MIN) s = INT32_MIN;
++        SET_LANE_U32(r, i, (uint32_t)s);
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Integer add (saturating, unsigned) ===
++    case 512:  // vaddubs
++      for (int i = 0; i < 16; i++) {
++        unsigned s = (unsigned)LANE_U8(a, i) + (unsigned)LANE_U8(b, i);
++        if (s > UINT8_MAX) s = UINT8_MAX;
++        SET_LANE_U8(r, i, (uint8_t)s);
++      }
++      setVRBytes(vrt, r); break;
++    case 576:  // vadduhs
++      for (int i = 0; i < 8; i++) {
++        unsigned s = (unsigned)LANE_U16(a, i) + (unsigned)LANE_U16(b, i);
++        if (s > UINT16_MAX) s = UINT16_MAX;
++        SET_LANE_U16(r, i, (uint16_t)s);
++      }
++      setVRBytes(vrt, r); break;
++    case 640:  // vadduws
++      for (int i = 0; i < 4; i++) {
++        uint64_t s = (uint64_t)LANE_U32(a, i) + (uint64_t)LANE_U32(b, i);
++        if (s > UINT32_MAX) s = UINT32_MAX;
++        SET_LANE_U32(r, i, (uint32_t)s);
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Integer sub (saturating, signed) ===
++    case 1792: // vsubsbs
++      for (int i = 0; i < 16; i++) {
++        int s = (int)LANE_S8(a, i) - (int)LANE_S8(b, i);
++        if (s > INT8_MAX) s = INT8_MAX;
++        if (s < INT8_MIN) s = INT8_MIN;
++        SET_LANE_U8(r, i, (uint8_t)s);
++      }
++      setVRBytes(vrt, r); break;
++    case 1856: // vsubshs
++      for (int i = 0; i < 8; i++) {
++        int s = (int)LANE_S16(a, i) - (int)LANE_S16(b, i);
++        if (s > INT16_MAX) s = INT16_MAX;
++        if (s < INT16_MIN) s = INT16_MIN;
++        SET_LANE_U16(r, i, (uint16_t)s);
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Integer sub (saturating, unsigned) ===
++    case 1536: // vsububs
++      for (int i = 0; i < 16; i++) {
++        int s = (int)LANE_U8(a, i) - (int)LANE_U8(b, i);
++        if (s < 0) s = 0;
++        SET_LANE_U8(r, i, (uint8_t)s);
++      }
++      setVRBytes(vrt, r); break;
++    case 1600: // vsubuhs
++      for (int i = 0; i < 8; i++) {
++        int s = (int)LANE_U16(a, i) - (int)LANE_U16(b, i);
++        if (s < 0) s = 0;
++        SET_LANE_U16(r, i, (uint16_t)s);
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Average unsigned (rounded: (a+b+1)>>1) ===
++    case 1026: // vavgub
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i,
++                    ((unsigned)LANE_U8(a, i) + LANE_U8(b, i) + 1) >> 1);
++      }
++      setVRBytes(vrt, r); break;
++    case 1090: // vavguh
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i,
++                     ((unsigned)LANE_U16(a, i) + LANE_U16(b, i) + 1) >> 1);
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Vector multiply per-lane (i32x4.mul) ===
++    case 137: { // vmuluwm: per-lane i32 multiply (low 32 bits)
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i, LANE_U32(a, i) * LANE_U32(b, i));
++      }
++      setVRBytes(vrt, r); break;
++    }
++
++    // === POWER10 vmulld: per-lane i64 multiply (low 64 bits) ===
++    case 457: {
++      for (int i = 0; i < 2; i++) {
++        uint64_t av = 0, bv = 0;
++        for (int j = 0; j < 8; j++) {
++          av |= ((uint64_t)a[i * 8 + j]) << (j * 8);
++          bv |= ((uint64_t)b[i * 8 + j]) << (j * 8);
++        }
++        uint64_t prod = av * bv;  // low 64 bits, modulo wrap
++        for (int j = 0; j < 8; j++) {
++          r[i * 8 + j] = (uint8_t)(prod >> (j * 8));
++        }
++      }
++      setVRBytes(vrt, r); break;
++    }
++
++    // === vmule/vmulo* (multiply even/odd lanes, widening) ===
++    //
++    // All XO values below were verified by disassembling the
++    // PPC_vmule*/PPC_vmulo* constants from Assembler-ppc64.h with
++    // `as -mppc64 -mlittle` + `objdump -Mpower9 -d`. The previous
++    // version had all 12 XO labels swapped with each other's semantic
++    // pair (so the JIT's vmulesb was decoded as vmulosb and vice
++    // versa), causing i8x16→i16x8 extmul to produce wrong halfwords.
++    //
++    //   PPC_vmuloub = 0x10000008 → XO=8     vmuloub (LE even-byte pairs)
++    //   PPC_vmulouh = 0x10000048 → XO=72    vmulouh
++    //   PPC_vmulouw = 0x10000088 → XO=136   vmulouw
++    //   PPC_vmulosb = 0x10000108 → XO=264   vmulosb
++    //   PPC_vmulosh = 0x10000148 → XO=328   vmulosh
++    //   PPC_vmulosw = 0x10000188 → XO=392   vmulosw
++    //   PPC_vmuleub = 0x10000208 → XO=520   vmuleub (LE odd-byte pairs)
++    //   PPC_vmuleuh = 0x10000248 → XO=584   vmuleuh
++    //   PPC_vmuleuw = 0x10000288 → XO=648   vmuleuw
++    //   PPC_vmulesb = 0x10000308 → XO=776   vmulesb
++    //   PPC_vmulesh = 0x10000348 → XO=840   vmulesh
++    //   PPC_vmulesw = 0x10000388 → XO=904   vmulesw
++    //
++    // Lane indexing on LE storage: "BE-even byte i" is stored at LE
++    // byte index (15 - 2i); since our LANE_S8 uses LE byte index, the
++    // "BE-even" = "LE-odd" mapping gives `2*i + 1` for vmule, `2*i`
++    // for vmulo. The JIT's extmul helpers emit `vmulesb + vmulosb +
++    // vmrglh` to pack both halves; getting the semantics swapped here
++    // produces the right result register but with the halves in the
++    // wrong merge order, breaking extmul.
++    case 776: { // vmulesb: signed BE-even byte → halfword (8 results)
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i,
++                     (int16_t)LANE_S8(a, 2 * i + 1) *
++                     (int16_t)LANE_S8(b, 2 * i + 1));
++      }
++      setVRBytes(vrt, r); break;
++    }
++    case 520: { // vmuleub: unsigned BE-even byte → halfword
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i,
++                     (uint16_t)LANE_U8(a, 2 * i + 1) *
++                     (uint16_t)LANE_U8(b, 2 * i + 1));
++      }
++      setVRBytes(vrt, r); break;
++    }
++    case 840: { // vmulesh: signed BE-even halfword → word
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i,
++                     (int32_t)LANE_S16(a, 2 * i + 1) *
++                     (int32_t)LANE_S16(b, 2 * i + 1));
++      }
++      setVRBytes(vrt, r); break;
++    }
++    case 584: { // vmuleuh: unsigned BE-even halfword → word
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i,
++                     (uint32_t)LANE_U16(a, 2 * i + 1) *
++                     (uint32_t)LANE_U16(b, 2 * i + 1));
++      }
++      setVRBytes(vrt, r); break;
++    }
++    case 904: { // vmulesw: signed BE-even word → dword (POWER8)
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i,
++                     (int64_t)LANE_S32(a, 2 * i + 1) *
++                     (int64_t)LANE_S32(b, 2 * i + 1));
++      }
++      setVRBytes(vrt, r); break;
++    }
++    case 648: { // vmuleuw: unsigned BE-even word → dword (POWER8)
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i,
++                     (uint64_t)LANE_U32(a, 2 * i + 1) *
++                     (uint64_t)LANE_U32(b, 2 * i + 1));
++      }
++      setVRBytes(vrt, r); break;
++    }
++    case 264: { // vmulosb: signed BE-odd byte → halfword
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i,
++                     (int16_t)LANE_S8(a, 2 * i) *
++                     (int16_t)LANE_S8(b, 2 * i));
++      }
++      setVRBytes(vrt, r); break;
++    }
++    case 8: { // vmuloub: unsigned BE-odd byte
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i,
++                     (uint16_t)LANE_U8(a, 2 * i) *
++                     (uint16_t)LANE_U8(b, 2 * i));
++      }
++      setVRBytes(vrt, r); break;
++    }
++    case 328: { // vmulosh: signed BE-odd halfword → word
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i,
++                     (int32_t)LANE_S16(a, 2 * i) *
++                     (int32_t)LANE_S16(b, 2 * i));
++      }
++      setVRBytes(vrt, r); break;
++    }
++    case 72: { // vmulouh: unsigned BE-odd halfword → word
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i,
++                     (uint32_t)LANE_U16(a, 2 * i) *
++                     (uint32_t)LANE_U16(b, 2 * i));
++      }
++      setVRBytes(vrt, r); break;
++    }
++    case 392: { // vmulosw: signed BE-odd word
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i,
++                     (int64_t)LANE_S32(a, 2 * i) *
++                     (int64_t)LANE_S32(b, 2 * i));
++      }
++      setVRBytes(vrt, r); break;
++    }
++    case 136: { // vmulouw: unsigned BE-odd word
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i,
++                     (uint64_t)LANE_U32(a, 2 * i) *
++                     (uint64_t)LANE_U32(b, 2 * i));
++      }
++      setVRBytes(vrt, r); break;
++    }
++
++    // === Per-lane rotate left (vrl{b,h,w,d}) ===
++    case 4:    // vrlb
++      for (int i = 0; i < 16; i++) {
++        uint8_t v = LANE_U8(a, i);
++        uint32_t s = LANE_U8(b, i) & 7;
++        SET_LANE_U8(r, i, (uint8_t)((v << s) | (v >> ((8 - s) & 7))));
++      }
++      setVRBytes(vrt, r); break;
++    case 68:   // vrlh
++      for (int i = 0; i < 8; i++) {
++        uint16_t v = LANE_U16(a, i);
++        uint32_t s = LANE_U16(b, i) & 15;
++        SET_LANE_U16(r, i, (uint16_t)((v << s) | (v >> ((16 - s) & 15))));
++      }
++      setVRBytes(vrt, r); break;
++    case 132:  // vrlw
++      for (int i = 0; i < 4; i++) {
++        uint32_t v = LANE_U32(a, i);
++        uint32_t s = LANE_U32(b, i) & 31;
++        SET_LANE_U32(r, i, (v << s) | (v >> ((32 - s) & 31)));
++      }
++      setVRBytes(vrt, r); break;
++    case 196:  // vrld
++      for (int i = 0; i < 2; i++) {
++        uint64_t v = LANE_U64(a, i);
++        uint32_t s = LANE_U64(b, i) & 63;
++        SET_LANE_U64(r, i, (v << s) | (v >> ((64 - s) & 63)));
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Min / Max signed ===
++    case 258:  // vmaxsb
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i, std::max(LANE_S8(a, i), LANE_S8(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    case 322:  // vmaxsh
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, std::max(LANE_S16(a, i), LANE_S16(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    case 386:  // vmaxsw
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i, std::max(LANE_S32(a, i), LANE_S32(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    case 450:  // vmaxsd
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i, std::max(LANE_S64(a, i), LANE_S64(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    case 770:  // vminsb
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i, std::min(LANE_S8(a, i), LANE_S8(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    case 834:  // vminsh
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, std::min(LANE_S16(a, i), LANE_S16(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    case 898:  // vminsw
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i, std::min(LANE_S32(a, i), LANE_S32(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    case 962:  // vminsd
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i, std::min(LANE_S64(a, i), LANE_S64(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Min / Max unsigned ===
++    case 2:    // vmaxub
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i, std::max(LANE_U8(a, i), LANE_U8(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    case 66:   // vmaxuh
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, std::max(LANE_U16(a, i), LANE_U16(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    case 130:  // vmaxuw
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i, std::max(LANE_U32(a, i), LANE_U32(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    case 194:  // vmaxud
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i, std::max(LANE_U64(a, i), LANE_U64(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    case 514:  // vminub
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i, std::min(LANE_U8(a, i), LANE_U8(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    case 578:  // vminuh
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, std::min(LANE_U16(a, i), LANE_U16(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    case 642:  // vminuw
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i, std::min(LANE_U32(a, i), LANE_U32(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    case 706:  // vminud
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i, std::min(LANE_U64(a, i), LANE_U64(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Vector compare (eq, gt signed, gt unsigned, ne POWER9) ===
++    //
++    // All vcmp* ops set per-lane all-1s on true, all-0s on false. The
++    // record form (Rc=1, XO MSB bit set; XO_rec = XO_base + 1024) must
++    // additionally write CR6:
++    //   CR6.LT = 1 iff ALL lanes are true;
++    //   CR6.GT = 0 (always);
++    //   CR6.EQ = 1 iff NO lane is true;
++    //   CR6.SO = 0 (always).
++    // `i8x16.all_true` etc. in wasm rely on CR6.EQ via `mfocrf cr6`; the
++    // previous simulator implementation left CR6 untouched, so the
++    // predicate was always wrong.
++    //
++    // Helper: count true lanes by looking at byte 0 of each lane (all
++    // bytes within a "true" lane are 0xFF so byte 0 is a sound proxy).
++    #define VCMP_DONE(lanes_, lane_bytes_)                                \
++      do {                                                                \
++        setVRBytes(vrt, r);                                                \
++        if (xo >= 1024) {                                                  \
++          int numTrue_ = 0;                                                \
++          for (int i_ = 0; i_ < (lanes_); i_++) {                          \
++            if (r[i_ * (lane_bytes_)] == 0xFF) numTrue_++;                 \
++          }                                                                \
++          uint8_t field_ = 0;                                              \
++          if (numTrue_ == (lanes_)) field_ |= kCRFieldLT;                  \
++          if (numTrue_ == 0) field_ |= kCRFieldEQ;                         \
++          setCRField(6, field_);                                           \
++        }                                                                  \
++      } while (0)
++
++    case 6:    // vcmpequb (Rc=0)
++    case 1030: // vcmpequb. (record, CR6 updated)
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i, LANE_U8(a, i) == LANE_U8(b, i) ? 0xFF : 0);
++      }
++      VCMP_DONE(16, 1); break;
++    case 70:   // vcmpequh
++    case 1094: // vcmpequh.
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, LANE_U16(a, i) == LANE_U16(b, i) ? 0xFFFF : 0);
++      }
++      VCMP_DONE(8, 2); break;
++    case 134:  // vcmpequw
++    case 1158: // vcmpequw.
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i,
++                     LANE_U32(a, i) == LANE_U32(b, i) ? 0xFFFFFFFFu : 0);
++      }
++      VCMP_DONE(4, 4); break;
++    case 199:  // vcmpequd
++    case 1223: // vcmpequd.
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i,
++                     LANE_U64(a, i) == LANE_U64(b, i)
++                         ? UINT64_MAX
++                         : 0);
++      }
++      VCMP_DONE(2, 8); break;
++
++    // === Compare greater-than signed ===
++    case 774:  // vcmpgtsb
++    case 1798: // vcmpgtsb.
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i, LANE_S8(a, i) > LANE_S8(b, i) ? 0xFF : 0);
++      }
++      VCMP_DONE(16, 1); break;
++    case 838:  // vcmpgtsh
++    case 1862: // vcmpgtsh.
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, LANE_S16(a, i) > LANE_S16(b, i) ? 0xFFFF : 0);
++      }
++      VCMP_DONE(8, 2); break;
++    case 902:  // vcmpgtsw
++    case 1926: // vcmpgtsw.
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i,
++                     LANE_S32(a, i) > LANE_S32(b, i) ? 0xFFFFFFFFu : 0);
++      }
++      VCMP_DONE(4, 4); break;
++    case 967:  // vcmpgtsd
++    case 1991: // vcmpgtsd.
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i,
++                     LANE_S64(a, i) > LANE_S64(b, i) ? UINT64_MAX : 0);
++      }
++      VCMP_DONE(2, 8); break;
++
++    // === Compare greater-than unsigned ===
++    case 518:  // vcmpgtub
++    case 1542: // vcmpgtub.
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i, LANE_U8(a, i) > LANE_U8(b, i) ? 0xFF : 0);
++      }
++      VCMP_DONE(16, 1); break;
++    case 582:  // vcmpgtuh
++    case 1606: // vcmpgtuh.
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, LANE_U16(a, i) > LANE_U16(b, i) ? 0xFFFF : 0);
++      }
++      VCMP_DONE(8, 2); break;
++    case 646:  // vcmpgtuw
++    case 1670: // vcmpgtuw.
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i,
++                     LANE_U32(a, i) > LANE_U32(b, i) ? 0xFFFFFFFFu : 0);
++      }
++      VCMP_DONE(4, 4); break;
++    case 711:  // vcmpgtud
++    case 1735: // vcmpgtud.
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i,
++                     LANE_U64(a, i) > LANE_U64(b, i) ? UINT64_MAX : 0);
++      }
++      VCMP_DONE(2, 8); break;
++
++    // === Splat from immediate (5-bit signed splat into all lanes) ===
++    // ISA defines UIM in BE element numbering. For LE storage, BE element i = LE element (N-1-i).
++    case 524:  // vspltb: VRT[*] = VRB[BE-byte-UIM]; uimm from VRA field (bits 11..15)
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i, LANE_U8(b, 15 - (uimm & 0xF)));
++      }
++      setVRBytes(vrt, r); break;
++    case 588:  // vsplth
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, LANE_U16(b, 7 - (uimm & 0x7)));
++      }
++      setVRBytes(vrt, r); break;
++    case 652:  // vspltw
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i, LANE_U32(b, 3 - (uimm & 0x3)));
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Splat 5-bit signed immediate to all byte lanes ===
++    case 780: {  // vspltisb VRT, SIMM5
++      int32_t simm5 = (int32_t)((instr->instructionBits() >> 16) & 0x1F);
++      if (simm5 & 0x10) simm5 |= ~0x1F;
++      uint8_t b = (uint8_t)(int8_t)simm5;
++      memset(r, b, 16);
++      setVRBytes(vrt, r); break;
++    }
++
++    // === Splat 5-bit signed immediate to all halfword lanes ===
++    case 844: {  // vspltish VRT, SIMM5
++      // SIMM5 occupies bits 11..15 of the instruction (VRA field). It
++      // is sign-extended to 16 bits and replicated across all 8 halfword
++      // lanes of VRT. Range: [-16, 15].
++      int32_t simm5 = (int32_t)((instr->instructionBits() >> 16) & 0x1F);
++      if (simm5 & 0x10) simm5 |= ~0x1F;  // sign-extend bit 4
++      int16_t hw = (int16_t)simm5;
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, (uint16_t)hw);
++      }
++      setVRBytes(vrt, r); break;
++    }
++
++    // === Splat 5-bit signed immediate to all word lanes ===
++    case 908: {  // vspltisw VRT, SIMM5
++      int32_t simm5 = (int32_t)((instr->instructionBits() >> 16) & 0x1F);
++      if (simm5 & 0x10) simm5 |= ~0x1F;
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i, (uint32_t)simm5);
++      }
++      setVRBytes(vrt, r); break;
++    }
++
++    // === Merge (interleave) ===
++    //
++    // The ISA defines vmrgh* / vmrgl* in BE numbering; the
++    // empirical LE storage behaviour is:
++    //   vmrgh* VT,VA,VB: for i in 0..N/2-1,
++    //     VT.lane_LE[2i]   = VB.lane_LE[(N/2) + i]
++    //     VT.lane_LE[2i+1] = VA.lane_LE[(N/2) + i]
++    //   vmrgl* VT,VA,VB: for i in 0..N/2-1,
++    //     VT.lane_LE[2i]   = VB.lane_LE[i]
++    //     VT.lane_LE[2i+1] = VA.lane_LE[i]
++    // i.e. the VB operand goes to the even result positions (reversed
++    // from what a naïve BE reading would suggest) and the "high" form
++    // selects the upper-half of LE storage.
++    //
++    // Previous implementation had both the operand order swapped AND
++    // the high/low halves swapped (consistent with each other, so
++    // JIT-only-visible ops that round-tripped through vmrg* happened
++    // to produce the right answer, but wasm-visible extmul exposed
++    // the bug).
++    case 12:   // vmrghb
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U8(r, 2 * i, LANE_U8(b, 8 + i));
++        SET_LANE_U8(r, 2 * i + 1, LANE_U8(a, 8 + i));
++      }
++      setVRBytes(vrt, r); break;
++    case 76:   // vmrghh
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U16(r, 2 * i, LANE_U16(b, 4 + i));
++        SET_LANE_U16(r, 2 * i + 1, LANE_U16(a, 4 + i));
++      }
++      setVRBytes(vrt, r); break;
++    case 140:  // vmrghw
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U32(r, 2 * i, LANE_U32(b, 2 + i));
++        SET_LANE_U32(r, 2 * i + 1, LANE_U32(a, 2 + i));
++      }
++      setVRBytes(vrt, r); break;
++    case 268:  // vmrglb
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U8(r, 2 * i, LANE_U8(b, i));
++        SET_LANE_U8(r, 2 * i + 1, LANE_U8(a, i));
++      }
++      setVRBytes(vrt, r); break;
++    case 332:  // vmrglh
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U16(r, 2 * i, LANE_U16(b, i));
++        SET_LANE_U16(r, 2 * i + 1, LANE_U16(a, i));
++      }
++      setVRBytes(vrt, r); break;
++    case 396:  // vmrglw
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U32(r, 2 * i, LANE_U32(b, i));
++        SET_LANE_U32(r, 2 * i + 1, LANE_U32(a, i));
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Per-lane shift left (count from VRB, low N bits per element) ===
++    case 260:  // vslb
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i, LANE_U8(a, i) << (LANE_U8(b, i) & 7));
++      }
++      setVRBytes(vrt, r); break;
++    case 324:  // vslh
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, LANE_U16(a, i) << (LANE_U16(b, i) & 15));
++      }
++      setVRBytes(vrt, r); break;
++    case 388:  // vslw
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i, LANE_U32(a, i) << (LANE_U32(b, i) & 31));
++      }
++      setVRBytes(vrt, r); break;
++    case 1476: // vsld
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i, LANE_U64(a, i) << (LANE_U64(b, i) & 63));
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Per-lane shift right unsigned ===
++    case 516:  // vsrb
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i, LANE_U8(a, i) >> (LANE_U8(b, i) & 7));
++      }
++      setVRBytes(vrt, r); break;
++    case 580:  // vsrh
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, LANE_U16(a, i) >> (LANE_U16(b, i) & 15));
++      }
++      setVRBytes(vrt, r); break;
++    case 644:  // vsrw
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i, LANE_U32(a, i) >> (LANE_U32(b, i) & 31));
++      }
++      setVRBytes(vrt, r); break;
++    case 1732: // vsrd
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i, LANE_U64(a, i) >> (LANE_U64(b, i) & 63));
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Per-lane shift right algebraic (signed) ===
++    case 772:  // vsrab
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i,
++                    (uint8_t)(LANE_S8(a, i) >> (LANE_U8(b, i) & 7)));
++      }
++      setVRBytes(vrt, r); break;
++    case 836:  // vsrah
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i,
++                     (uint16_t)(LANE_S16(a, i) >> (LANE_U16(b, i) & 15)));
++      }
++      setVRBytes(vrt, r); break;
++    case 900:  // vsraw
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i,
++                     (uint32_t)(LANE_S32(a, i) >> (LANE_U32(b, i) & 31)));
++      }
++      setVRBytes(vrt, r); break;
++    case 964:  // vsrad
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i,
++                     (uint64_t)(LANE_S64(a, i) >> (LANE_U64(b, i) & 63)));
++      }
++      setVRBytes(vrt, r); break;
++
++    // === POWER9 per-lane integer negate (subop in VRA field) ===
++    // PPC_vnegw = 0x10060602 → XO=0x602=1538, VRA=6
++    // PPC_vnegd = 0x10070602 → XO=0x602=1538, VRA=7
++    case 1538:
++      if (vra == 6) {  // vnegw
++        for (int i = 0; i < 4; i++) {
++          SET_LANE_U32(r, i, (uint32_t)(-LANE_S32(b, i)));
++        }
++      } else if (vra == 7) {  // vnegd
++        for (int i = 0; i < 2; i++) {
++          SET_LANE_U64(r, i, (uint64_t)(-LANE_S64(b, i)));
++        }
++      } else {
++        MOZ_CRASH_UNSAFE_PRINTF("decodeVMX XO=1538: unknown subop %u", vra);
++      }
++      setVRBytes(vrt, r); break;
++
++    // === POWER10 vextract{b,h,w,d}m (XO=1602=0x642) ===
++    // RT (GPR) gets the wasm-spec bitmask in low 16/8/4/2 bits. UIM at
++    // bits 11..15 (= sim `vra`) selects lane width: 8=byte, 9=halfword,
++    // 10=word, 11=doubleword.
++    case 1602: {
++      uint64_t result = 0;
++      switch (vra) {
++        case 8:  // vextractbm: 16 byte lanes
++          for (int i = 0; i < 16; i++) {
++            if (b[i] & 0x80) result |= (1ULL << i);
++          }
++          break;
++        case 9:  // vextracthm: 8 halfword lanes; MSB lives at byte 2i+1
++          for (int i = 0; i < 8; i++) {
++            if (b[2 * i + 1] & 0x80) result |= (1ULL << i);
++          }
++          break;
++        case 10:  // vextractwm: 4 word lanes; MSB at byte 4i+3
++          for (int i = 0; i < 4; i++) {
++            if (b[4 * i + 3] & 0x80) result |= (1ULL << i);
++          }
++          break;
++        case 11:  // vextractdm: 2 dword lanes; MSB at byte 8i+7
++          for (int i = 0; i < 2; i++) {
++            if (b[8 * i + 7] & 0x80) result |= (1ULL << i);
++          }
++          break;
++        default:
++          MOZ_CRASH_UNSAFE_PRINTF("decodeVMX XO=1602: unknown UIM %u", vra);
++      }
++      // vrt is the GPR target (RT field at bits 6..10).
++      setRegister(int(vrt), int64_t(result));
++      goto vmx_done;  // Skip the trailing setVRBytes used by VR-targeting ops.
++    }
++
++    // === POWER9 vinsertb (XO=781) / vinserth (XO=845) ===
++    // Insert byte/halfword from a VR (NOT a GPR) at an immediate byte
++    // position UIM (BE).
++    //   vinsertb: VRT.byte[UIM]   (BE) ← VRB.byte[7] (BE)
++    //   vinserth: VRT.byte[UIM]   (BE) ← VRB.byte[6] (BE)
++    //             VRT.byte[UIM+1] (BE) ← VRB.byte[7] (BE)
++    // BE byte i ↔ LE byte (15-i). So VRB.byte[6] (BE) = LE byte 9 of
++    // VRB, VRB.byte[7] (BE) = LE byte 8. (Byte-pair order matters.)
++    case 781:    // vinsertb
++    case 845: {  // vinserth
++      getVRBytes(vrt, r);  // start from current VRT
++      if (xo == 845) {
++        // vinserth: copy 2-byte halfword (BE bytes 6..7 of VRB).
++        r[15 - uimm]     = b[9];  // BE byte UIM   ← VRB BE byte 6
++        r[14 - uimm]     = b[8];  // BE byte UIM+1 ← VRB BE byte 7
++      } else {
++        // vinsertb: copy a single byte (BE byte 7 of VRB).
++        r[15 - uimm]     = b[8];  // BE byte UIM   ← VRB BE byte 7
++      }
++      setVRBytes(vrt, r); break;
++    }
++
++    // === POWER9 vextractub (XO=525) / vextractuh (XO=589) ===
++    // Extract one byte/halfword from VRB at immediate BE position UIM
++    // and place it at BE byte 7 of VRT, with all other bytes of VRT
++    // zeroed. Companion to vinsertb/h; chooses an immediate BE position
++    // and lands the result at the low byte of VRT (= low byte of mfvsrd).
++    //   vextractub: VRT.byte[7] (BE) ← VRB.byte[UIM] (BE), rest = 0
++    //   vextractuh: VRT.byte[6] (BE) ← VRB.byte[UIM]   (BE)
++    //               VRT.byte[7] (BE) ← VRB.byte[UIM+1] (BE), rest = 0
++    case 525:    // vextractub
++    case 589: {  // vextractuh
++      memset(r, 0, sizeof(r));
++      if (xo == 589) {
++        r[9] = b[15 - uimm];  // VRT BE byte 6 ← VRB BE byte UIM
++        r[8] = b[14 - uimm];  // VRT BE byte 7 ← VRB BE byte UIM+1
++      } else {
++        r[8] = b[15 - uimm];  // VRT BE byte 7 ← VRB BE byte UIM
++      }
++      setVRBytes(vrt, r); break;
++    }
++
++    // === POWER10 vinsbrx (XO=783) / vinshrx (XO=847) ===
++    // Right-indexed (LE-natural) byte/halfword insert from GPR. RA's
++    // low 4 bits supply the byte position (mod 16); for vinshrx the
++    // position is also masked to even (& 0xE) so the halfword is
++    // 2-byte aligned. RB's low 8 / 16 bits are inserted; other bytes
++    // of VRT are unchanged. RA and RB are GPRs (NOT VRs) — sim's
++    // pre-fetched `a` and `b` from getVRBytes are unused here.
++    case 783:    // vinsbrx
++    case 847: {  // vinshrx
++      uint64_t ra_val = U64(getRegister(int(vra)));
++      uint64_t rb_val = U64(getRegister(int(vrb)));
++      getVRBytes(vrt, r);  // start from current VRT
++      const bool isHalf = (xo == 847);
++      const uint32_t pos = isHalf ? uint32_t(ra_val & 0xEULL)
++                                  : uint32_t(ra_val & 0xFULL);
++      r[pos] = (uint8_t)(rb_val & 0xFFULL);
++      if (isHalf) {
++        r[pos + 1] = (uint8_t)((rb_val >> 8) & 0xFFULL);
++      }
++      setVRBytes(vrt, r); break;
++    }
++
++    // === POWER10 vinsw (XO=207) / vinsd (XO=463) ===
++    // VRT[UIM*8:UIM*8+N-1] (BE bits) ← RB low N bits, where N = 32 or 64.
++    // RB is a GPR (the `vrb` field at sim bits 15..11). UIM is at sim
++    // bits 20..16 (= the `uimm` / `vra` decode). Other bytes of VRT are
++    // unchanged, so we read VRT first then patch UIM..UIM+(N/8-1).
++    case 207:    // vinsw
++    case 463: {  // vinsd
++      uint64_t rb_val = U64(getRegister(int(vrb)));
++      getVRBytes(vrt, r);  // start from current VRT
++      const int width = (xo == 463) ? 8 : 4;  // bytes
++      // BE byte UIM+i of VRT = LE byte (15 - UIM - i).
++      // For vinsd, RB.dword[0] (BE) = bits 56..63 of rb_val (host LSB end
++      // of the GPR — recall U64() puts the canonical 64-bit value in a
++      // host uint64_t with bit 63 = MSB).
++      // For vinsw, source is RB[32:63] = low 32 bits of rb_val.
++      uint64_t src = (width == 8) ? rb_val : (rb_val & 0xFFFFFFFFULL);
++      const int srcMsbShift = (width * 8) - 8;  // 56 or 24
++      for (int i = 0; i < width; i++) {
++        r[15 - uimm - i] = (uint8_t)(src >> (srcMsbShift - 8 * i));
++      }
++      setVRBytes(vrt, r); break;
++    }
++
++    // === POWER8+ vbpermq (XO=1356=0x54C): per-byte bit permute ===
++    // For each i in 0..15, take VRB BE-byte i (= sim b[15-i]); if its
++    // high bit is set, perm[i]=0; else perm[i] = bit at BE position
++    // (low 7 bits) of VRA. ISA says perm[0..15] go into VRT.dw[1] low
++    // 16 bits, but on real LE silicon the bitmap is observable in dw[0]
++    // low 16 bits — i.e., recoverable via mfvsrd. Match that observable
++    // behaviour: write the bitmap into sim bytes[8..9] (where mfvsrd
++    // reads dw[0] from), zero the rest.
++    case 1356: {
++      uint8_t perm[16];
++      for (int k = 0; k < 16; k++) {
++        uint8_t ctl = b[15 - k];
++        if (ctl & 0x80) {
++          perm[k] = 0;
++        } else {
++          int p = ctl & 0x7F;
++          int le_idx = 15 - (p / 8);
++          int bit_in_byte = 7 - (p % 8);
++          perm[k] = (a[le_idx] >> bit_in_byte) & 1;
++        }
++      }
++      uint8_t lo = 0, hi = 0;
++      for (int k = 0; k < 8; k++) hi = (hi << 1) | perm[k];
++      for (int k = 8; k < 16; k++) lo = (lo << 1) | perm[k];
++      for (int i = 0; i < 16; i++) r[i] = 0;
++      r[8] = lo;
++      r[9] = hi;
++      setVRBytes(vrt, r); break;
++    }
++
++    // VA-form ops vmladduhm (XO=34), vsel (XO=42), vperm (XO=43) are
++    // peeled off in the pre-dispatch above (see "VA-form pre-dispatch"
++    // comment near the top of this function), since the 11-bit XO
++    // mask conflates VRC into the case label.
++
++    // === Unpack high signed (BE-numbering = LE indices 8..15) ===
++    // vupkhsb: VRT[i] = sign_extend_to_16(VRA[i+0..7]). On LE storage with
++    // BE-named "high" being the low-indexed bytes, vupkhsb sign-extends the
++    // low 8 bytes of VRA into 8 halfwords. PPC64LE wasm calls these the
++    // "high" lanes per PPC convention; the JIT compensates internally via
++    // the vupklsb/vupkhsb swap documented in MacroAssembler-ppc64-inl.h.
++    case 526:  // vupkhsb (high signed byte → halfword)
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, (uint16_t)(int16_t)LANE_S8(b, 8 + i));
++      }
++      setVRBytes(vrt, r); break;
++    case 590:  // vupkhsh (high signed halfword → word)
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i, (uint32_t)(int32_t)LANE_S16(b, 4 + i));
++      }
++      setVRBytes(vrt, r); break;
++    case 1614: // vupkhsw (high signed word → dword) POWER8+
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i, (uint64_t)(int64_t)LANE_S32(b, 2 + i));
++      }
++      setVRBytes(vrt, r); break;
++    case 654:  // vupklsb (low signed byte → halfword) — PPC LE: takes high lanes
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, (uint16_t)(int16_t)LANE_S8(b, i));
++      }
++      setVRBytes(vrt, r); break;
++    case 718:  // vupklsh (low signed halfword → word)
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i, (uint32_t)(int32_t)LANE_S16(b, i));
++      }
++      setVRBytes(vrt, r); break;
++    case 1742: // vupklsw (low signed word → dword)
++      for (int i = 0; i < 2; i++) {
++        SET_LANE_U64(r, i, (uint64_t)(int64_t)LANE_S32(b, i));
++      }
++      setVRBytes(vrt, r); break;
++
++    // === Pack (saturate or modulo) ===
++    //
++    // vpk* definitions are BE-specified:
++    // VT.byte[0..7] = saturate(VA.halfword[0..7]), VT.byte[8..15] =
++    // saturate(VB.halfword[0..7]) (BE-numbered throughout). On
++    // PPC64LE register storage that inverts to: LE bytes 0-7 = VB's
++    // saturated halfwords, LE bytes 8-15 = VA's.
++    //
++    //   vpkshus = XO 270   (s16 → u8 sat)
++    //   vpkshss = XO 398   (s16 → s8 sat)
++    //   vpkswus = XO 334   (s32 → u16 sat)
++    //   vpkswss = XO 462   (s32 → s16 sat)
++    // The sim previously had three of these four labels rotated
++    // (270=vpkshss, 334=vpkshus, 398=vpkswus) so every i8x16/i16x8
++    // narrow_* call silently used the wrong saturation kind or
++    // lane width — vpkshss was completely absent.
++    case 398: { // vpkshss (signed halfword → signed byte)
++      for (int i = 0; i < 8; i++) {
++        int v = LANE_S16(b, i);
++        if (v > INT8_MAX) v = INT8_MAX;
++        if (v < INT8_MIN) v = INT8_MIN;
++        SET_LANE_U8(r, i, (uint8_t)(int8_t)v);
++      }
++      for (int i = 0; i < 8; i++) {
++        int v = LANE_S16(a, i);
++        if (v > INT8_MAX) v = INT8_MAX;
++        if (v < INT8_MIN) v = INT8_MIN;
++        SET_LANE_U8(r, 8 + i, (uint8_t)(int8_t)v);
++      }
++      setVRBytes(vrt, r); break;
++    }
++    case 462: { // vpkswss (signed word → signed halfword)
++      for (int i = 0; i < 4; i++) {
++        int64_t v = LANE_S32(b, i);
++        if (v > INT16_MAX) v = INT16_MAX;
++        if (v < INT16_MIN) v = INT16_MIN;
++        SET_LANE_U16(r, i, (uint16_t)(int16_t)v);
++      }
++      for (int i = 0; i < 4; i++) {
++        int64_t v = LANE_S32(a, i);
++        if (v > INT16_MAX) v = INT16_MAX;
++        if (v < INT16_MIN) v = INT16_MIN;
++        SET_LANE_U16(r, 4 + i, (uint16_t)(int16_t)v);
++      }
++      setVRBytes(vrt, r); break;
++    }
++    case 270: { // vpkshus (signed halfword → unsigned byte, sat)
++      for (int i = 0; i < 8; i++) {
++        int v = LANE_S16(b, i);
++        if (v > UINT8_MAX) v = UINT8_MAX;
++        if (v < 0) v = 0;
++        SET_LANE_U8(r, i, (uint8_t)v);
++      }
++      for (int i = 0; i < 8; i++) {
++        int v = LANE_S16(a, i);
++        if (v > UINT8_MAX) v = UINT8_MAX;
++        if (v < 0) v = 0;
++        SET_LANE_U8(r, 8 + i, (uint8_t)v);
++      }
++      setVRBytes(vrt, r); break;
++    }
++    case 334: { // vpkswus (signed word → unsigned halfword, sat)
++      for (int i = 0; i < 4; i++) {
++        int64_t v = LANE_S32(b, i);
++        if (v > UINT16_MAX) v = UINT16_MAX;
++        if (v < 0) v = 0;
++        SET_LANE_U16(r, i, (uint16_t)v);
++      }
++      for (int i = 0; i < 4; i++) {
++        int64_t v = LANE_S32(a, i);
++        if (v > UINT16_MAX) v = UINT16_MAX;
++        if (v < 0) v = 0;
++        SET_LANE_U16(r, 4 + i, (uint16_t)v);
++      }
++      setVRBytes(vrt, r); break;
++    }
++
++    // === POWER9 compare not-equal (vcmpne{b,h,w}) — Rc=0 and Rc=1 ===
++    case 7:    // vcmpneb
++    case 1031: // vcmpneb.
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i, LANE_U8(a, i) != LANE_U8(b, i) ? 0xFF : 0);
++      }
++      VCMP_DONE(16, 1); break;
++    case 71:   // vcmpneh
++    case 1095: // vcmpneh.
++      for (int i = 0; i < 8; i++) {
++        SET_LANE_U16(r, i, LANE_U16(a, i) != LANE_U16(b, i) ? 0xFFFF : 0);
++      }
++      VCMP_DONE(8, 2); break;
++    case 135:  // vcmpnew
++    case 1159: // vcmpnew.
++      for (int i = 0; i < 4; i++) {
++        SET_LANE_U32(r, i,
++                     LANE_U32(a, i) != LANE_U32(b, i) ? 0xFFFFFFFFu : 0);
++      }
++      VCMP_DONE(4, 4); break;
++    #undef VCMP_DONE
++
++    // === Population count per byte (POWER8) ===
++    case 1795: { // vpopcntb (XO 0x703 = 1795). VRA field unused.
++      for (int i = 0; i < 16; i++) {
++        SET_LANE_U8(r, i, (uint8_t)__builtin_popcount(LANE_U8(b, i)));
++      }
++      setVRBytes(vrt, r); break;
++    }
++
++    // === vsldoi: VRT = (VRA || VRB) shifted left by SH bytes (SH at bits 22..25) ===
++    case 44: case 45: case 46: case 47: {
++      // SH is at bits 22..25 (PPC) → LSB bits 6..9 of the instruction →
++      // (instructionBits >> 6) & 0xF. Our XO mask already bottoms-out at
++      // bit 0, so extract from the raw instruction.
++      uint32_t sh = (instr->instructionBits() >> 6) & 0xF;
++      uint8_t cat[32];
++      memcpy(cat, a, 16);
++      memcpy(cat + 16, b, 16);
++      for (int i = 0; i < 16; i++) {
++        r[i] = cat[sh + i];
++      }
++      setVRBytes(vrt, r); break;
++    }
++
++
++    default:
++      MOZ_CRASH_UNSAFE_PRINTF(
++          "decodeVMX: unimplemented XO=%u (instruction 0x%08x)", xo,
++          instr->instructionBits());
++  }
++
++vmx_done:
++  #undef LANE_U8
++  #undef LANE_S8
++  #undef LANE_U16
++  #undef LANE_S16
++  #undef LANE_U32
++  #undef LANE_S32
++  #undef LANE_U64
++  #undef LANE_S64
++  #undef SET_LANE_U8
++  #undef SET_LANE_U16
++  #undef SET_LANE_U32
++  #undef SET_LANE_U64
++  ;  // empty stmt for label
++}
++
++// -----------------------------------------------------------------------------
++// decodeVSX: Major opcode 60 (XX1-form, XX2-form)
++// mfvsrd, mtvsrd, mtvsrwz, mtvsrws, xscvdpsp, xscvdpspn, xscvspdp,
++// xscvspdpn, xxbrd
++
++void Simulator::decodeVSX(SimInstruction* instr) {
++  // VSX major opcode 60 covers XX1/XX2/XX3/XX4 forms. We dispatch XX4
++  // (xxsel) first because its XO is only 2 bits (at ISA 26-27 = sim
++  // bits 5-4), and the XC register field at ISA 21-25 would otherwise
++  // produce 32 different 9-bit XO values to enumerate in the switch.
++  // Peel off any instruction with XX4 XO=3 (xxsel). No XX2/XX3 op currently
++  // emitted by the JIT has sim bits (5,4) == 3.
++  if (instr->bits(5, 4) == 3) {
++    // xxsel XT,XA,XB,XC  (VA-like XX4-form).
++    //   XT[i] = (XA[i] & ~XC[i]) | (XB[i] & XC[i])
++    // Register fields: XA/XB/XT per-byte; XC at ISA bits 21-25 (sim
++    // bits 10-6) with CX extension at ISA bit 28 (sim bit 3).
++    int xa = int(instr->raValue() | (instr->bit(2) << 5));
++    int xb = int(instr->rbValue() | (instr->bit(1) << 5));
++    int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++    int xc = int(instr->bits(10, 6) | (instr->bit(3) << 5));
++    uint8_t ab[16], bb[16], cb[16], result[16];
++    getVSR128(xa, ab);
++    getVSR128(xb, bb);
++    getVSR128(xc, cb);
++    for (int i = 0; i < 16; i++) {
++      result[i] = (uint8_t)((ab[i] & ~cb[i]) | (bb[i] & cb[i]));
++    }
++    setVSR128(xt, result);
++    return;
++  }
++
++  // The remaining forms (XX1/XX2/XX3) share a 9-bit XO at ISA bits
++  // 21-29 (sim bits 10-2). For XX3 this is (8-bit XO << 1) | AX; for
++  // XX2 the full 9 bits are the XO (no AX field).
++  uint32_t xo = instr->bits(10, 2);
++  uint32_t rt = instr->rtValue();
++  uint32_t rb = instr->rbValue();
++
++  switch (xo) {
++    // xscvdpsp / xscvdpspn / xscvspdp / xscvspdpn / xxbrd are
++    // XX2-form: XT/XB are each 6-bit (5-bit field + TX/BX extension at
++    // sim bits 0/1). Post-Phase-2 the JIT emits these with Simd128
++    // targets (encoding 32-63), which require the extension bit to
++    // select VR-space instead of FPR-space. The previous code used
++    // only the 5-bit field, so any VR-space target silently clobbered
++    // FPR 0..31 and the post-splat fbits in splatX4 never reached the
++    // vector lanes.
++    case 265: {
++      // xscvdpsp: double→single with sNaN quieting. The ISA says
++      // result lands at XT[0:31] (BE word 0 = LE bytes 12..15) and
++      // XT[32:127] is "undefined". Real POWER9 silicon actually
++      // duplicates the result into BE word 1 as well, so the bytes
++      // at LE 8..11 hold the same single. The JIT's
++      // replaceLaneFloat32x4 lowering depends on this: it follows
++      // xscvdpspn with `xxinsertw …, 12`, which reads XB.word[1]
++      // (LE bytes 8..11). Zeroing those bytes here would silently
++      // lose the single under sim. Mirror HW.
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t bb[16];
++      getVSR128(xb, bb);
++      // Source double at BE DW0 = LE bytes 8..15 of xb.
++      uint64_t dbits = 0;
++      for (int i = 0; i < 8; i++) dbits |= ((uint64_t)bb[8 + i]) << (i * 8);
++      double frb;
++      memcpy(&frb, &dbits, sizeof(frb));
++      float result = demoteDoublePreservingNaN(frb);
++      uint32_t fbits;
++      memcpy(&fbits, &result, sizeof(fbits));
++      if ((fbits & 0x7F800000u) == 0x7F800000u && (fbits & 0x007FFFFFu) != 0) {
++        fbits |= 0x00400000u;
++      }
++      uint8_t out[16];
++      memset(out, 0, 8);
++      // BE word 1 (LE 8..11) and BE word 0 (LE 12..15) both = fbits.
++      for (int off : {8, 12}) {
++        out[off]     = (uint8_t)(fbits);
++        out[off + 1] = (uint8_t)(fbits >> 8);
++        out[off + 2] = (uint8_t)(fbits >> 16);
++        out[off + 3] = (uint8_t)(fbits >> 24);
++      }
++      setVSR128(xt, out);
++      break;
++    }
++    case 267: {
++      // xscvdpspn: same as xscvdpsp but non-signaling. Same HW-observed
++      // word-1 duplication (see xscvdpsp comment above).
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t bb[16];
++      getVSR128(xb, bb);
++      uint64_t dbits = 0;
++      for (int i = 0; i < 8; i++) dbits |= ((uint64_t)bb[8 + i]) << (i * 8);
++      double frb;
++      memcpy(&frb, &dbits, sizeof(frb));
++      float result = demoteDoublePreservingNaN(frb);
++      uint32_t fbits;
++      memcpy(&fbits, &result, sizeof(fbits));
++      uint8_t out[16];
++      memset(out, 0, 8);
++      for (int off : {8, 12}) {
++        out[off]     = (uint8_t)(fbits);
++        out[off + 1] = (uint8_t)(fbits >> 8);
++        out[off + 2] = (uint8_t)(fbits >> 16);
++        out[off + 3] = (uint8_t)(fbits >> 24);
++      }
++      setVSR128(xt, out);
++      break;
++    }
++    case 393: {
++      // xvcvdpsp: convert two doubles to two singles, replicating each
++      // result across its dword. BE words = [s(BE_dw0), s(BE_dw0),
++      // s(BE_dw1), s(BE_dw1)]. SIGNALING form per ISA: sNaN inputs are
++      // quieted (high-order fraction bit set in result).
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t bb[16], out[16];
++      getVSR128(xb, bb);
++      uint32_t fbits[2];
++      // BE_dw0 = LE bytes 8..15, BE_dw1 = LE bytes 0..7.
++      for (int dw = 0; dw < 2; dw++) {
++        int leOff = (dw == 0) ? 8 : 0;
++        uint64_t dbits = 0;
++        for (int i = 0; i < 8; i++) {
++          dbits |= ((uint64_t)bb[leOff + i]) << (i * 8);
++        }
++        double frb;
++        memcpy(&frb, &dbits, sizeof(frb));
++        float result = demoteDoublePreservingNaN(frb);
++        memcpy(&fbits[dw], &result, sizeof(uint32_t));
++        if ((fbits[dw] & 0x7F800000u) == 0x7F800000u &&
++            (fbits[dw] & 0x007FFFFFu) != 0) {
++          fbits[dw] |= 0x00400000u;  // quiet sNaN result
++        }
++      }
++      // LE words: [s(dw1), s(dw1), s(dw0), s(dw0)]
++      // (LE word 0 = BE word 3 = s(dw1); LE word 3 = BE word 0 = s(dw0)).
++      uint32_t leWords[4] = {fbits[1], fbits[1], fbits[0], fbits[0]};
++      for (int w = 0; w < 4; w++) {
++        out[w * 4]     = (uint8_t)leWords[w];
++        out[w * 4 + 1] = (uint8_t)(leWords[w] >> 8);
++        out[w * 4 + 2] = (uint8_t)(leWords[w] >> 16);
++        out[w * 4 + 3] = (uint8_t)(leWords[w] >> 24);
++      }
++      setVSR128(xt, out);
++      break;
++    }
++    case 216:    // xvcvdpsxws: double → signed word, saturating, RTZ (vector)
++    case 200: {  // xvcvdpuxws: double → unsigned word, saturating, RTZ (vector)
++      //   src1 := XB.dword_BE[0]; src2 := XB.dword_BE[1]
++      //   r1 := ConvertDPtoSat(src1); r2 := ConvertDPtoSat(src2)
++      //   XT.word_BE[0] := r1; XT.word_BE[1] := r1 (replicated)
++      //   XT.word_BE[2] := r2; XT.word_BE[3] := r2 (replicated)
++      // Saturation: signed clamps to [INT32_MIN, INT32_MAX] with NaN→INT32_MIN;
++      //             unsigned clamps to [0, UINT32_MAX] with NaN→0 and neg→0.
++      // BE_dw0 = LE bytes 8..15; BE_dw1 = LE bytes 0..7.
++      bool isSigned = (xo == 216);
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t bb[16], out[16];
++      getVSR128(xb, bb);
++      const int srcOffsets[2] = {8, 0};   // BE_dw0 (LE 8..15), BE_dw1 (LE 0..7)
++      uint32_t results[2];
++      for (int lane = 0; lane < 2; lane++) {
++        uint64_t dbits = 0;
++        for (int j = 0; j < 8; j++) {
++          dbits |= ((uint64_t)bb[srcOffsets[lane] + j]) << (j * 8);
++        }
++        double dval;
++        memcpy(&dval, &dbits, sizeof(dval));
++        if (std::isnan(dval)) {
++          results[lane] = isSigned ? 0x80000000u : 0u;
++        } else if (isSigned) {
++          if (dval >= 2147483647.0) {
++            results[lane] = 0x7FFFFFFFu;
++          } else if (dval <= -2147483648.0) {
++            results[lane] = 0x80000000u;
++          } else {
++            results[lane] = (uint32_t)(int32_t)dval;  // RTZ
++          }
++        } else {  // unsigned
++          if (dval <= 0.0) {
++            results[lane] = 0u;
++          } else if (dval >= 4294967295.0) {
++            results[lane] = 0xFFFFFFFFu;
++          } else {
++            results[lane] = (uint32_t)dval;  // RTZ
++          }
++        }
++      }
++      // Replicated layout: BE words [r1, r1, r2, r2]; in LE bytes
++      // [r2, r2, r1, r1] (LE word 0 = BE word 3 = r2, LE word 3 = BE word 0 = r1).
++      uint32_t leWords[4] = {results[1], results[1], results[0], results[0]};
++      for (int w = 0; w < 4; w++) {
++        out[w * 4]     = (uint8_t)leWords[w];
++        out[w * 4 + 1] = (uint8_t)(leWords[w] >> 8);
++        out[w * 4 + 2] = (uint8_t)(leWords[w] >> 16);
++        out[w * 4 + 3] = (uint8_t)(leWords[w] >> 24);
++      }
++      setVSR128(xt, out);
++      break;
++    }
++    case 248:    // xvcvsxwdp: signed word → double (vector)
++    case 232: {  // xvcvuxwdp: unsigned word → double (vector)
++      //   src1 := XB.word_BE[0]; src2 := XB.word_BE[2]
++      //   XT.dword_BE[0] := Convert(src1); XT.dword_BE[1] := Convert(src2)
++      // BE word 0 = LE bytes 12..15; BE word 2 = LE bytes 4..7.
++      // Output BE dword 0 = LE bytes 8..15; BE dword 1 = LE bytes 0..7.
++      // No NaN handling needed (integer source).
++      bool isSigned = (xo == 248);
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t bb[16], out[16];
++      getVSR128(xb, bb);
++      const int srcOffsets[2] = {12, 4};
++      const int dstOffsets[2] = {8, 0};
++      for (int lane = 0; lane < 2; lane++) {
++        uint32_t bits = (uint32_t)bb[srcOffsets[lane]] |
++                        ((uint32_t)bb[srcOffsets[lane] + 1] << 8) |
++                        ((uint32_t)bb[srcOffsets[lane] + 2] << 16) |
++                        ((uint32_t)bb[srcOffsets[lane] + 3] << 24);
++        double dval = isSigned ? (double)(int32_t)bits : (double)bits;
++        uint64_t dbits;
++        memcpy(&dbits, &dval, sizeof(dbits));
++        for (int i = 0; i < 8; i++) {
++          out[dstOffsets[lane] + i] = (uint8_t)(dbits >> (i * 8));
++        }
++      }
++      setVSR128(xt, out);
++      break;
++    }
++    case 457: {
++      // xvcvspdp: convert two singles to two doubles. SIGNALING form
++      // per ISA: sNaN inputs are quieted in the result (bit 51 set).
++      //   src1 := XB.word_BE[0]; src2 := XB.word_BE[2]
++      //   XT.dword_BE[0] := ConvertSPtoDP(src1)
++      //   XT.dword_BE[1] := ConvertSPtoDP(src2)
++      // BE word 0 = LE bytes 12..15; BE word 2 = LE bytes 4..7.
++      // Output BE dword 0 = LE bytes 8..15; BE dword 1 = LE bytes 0..7.
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t bb[16], out[16];
++      getVSR128(xb, bb);
++      // src1 from BE word 0 (LE 12..15), output dword at LE 8..15.
++      // src2 from BE word 2 (LE 4..7),   output dword at LE 0..7.
++      const int srcOffsets[2] = {12, 4};   // LE byte offsets of word_BE[0], word_BE[2]
++      const int dstOffsets[2] = {8, 0};    // LE byte offsets of dword_BE[0], dword_BE[1]
++      for (int lane = 0; lane < 2; lane++) {
++        uint32_t fbits = (uint32_t)bb[srcOffsets[lane]] |
++                         ((uint32_t)bb[srcOffsets[lane] + 1] << 8) |
++                         ((uint32_t)bb[srcOffsets[lane] + 2] << 16) |
++                         ((uint32_t)bb[srcOffsets[lane] + 3] << 24);
++        float fval;
++        memcpy(&fval, &fbits, sizeof(fval));
++        double dval = promoteFloatPreservingNaN(fval);
++        uint64_t dbits;
++        memcpy(&dbits, &dval, sizeof(dbits));
++        if ((dbits & 0x7FF0000000000000ULL) == 0x7FF0000000000000ULL &&
++            (dbits & 0x000FFFFFFFFFFFFFULL) != 0) {
++          dbits |= 0x0008000000000000ULL;  // quiet sNaN result
++        }
++        for (int i = 0; i < 8; i++) {
++          out[dstOffsets[lane] + i] = (uint8_t)(dbits >> (i * 8));
++        }
++      }
++      setVSR128(xt, out);
++      break;
++    }
++    case 329: {
++      // xscvspdp: single→double from BE word 0 of XB. SIGNALING form;
++      // an sNaN input yields a qNaN result with the high-order
++      // fraction bit (quiet bit) set.
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t bb[16];
++      getVSR128(xb, bb);
++      // BE word 0 = LE bytes 12..15 of xb.
++      uint32_t fbits = (uint32_t)bb[12] |
++                       ((uint32_t)bb[13] << 8) |
++                       ((uint32_t)bb[14] << 16) |
++                       ((uint32_t)bb[15] << 24);
++      float fval;
++      memcpy(&fval, &fbits, sizeof(fval));
++      double dval = promoteFloatPreservingNaN(fval);
++      uint64_t dbits;
++      memcpy(&dbits, &dval, sizeof(dbits));
++      // Quiet any NaN result (signaling form): set bit 51 of mantissa.
++      if ((dbits & 0x7FF0000000000000ULL) == 0x7FF0000000000000ULL &&
++          (dbits & 0x000FFFFFFFFFFFFFULL) != 0) {
++        dbits |= 0x0008000000000000ULL;
++      }
++      uint8_t out[16];
++      memset(out, 0, 8);
++      for (int i = 0; i < 8; i++) out[8 + i] = (uint8_t)(dbits >> (i * 8));
++      setVSR128(xt, out);
++      break;
++    }
++    case 331: {
++      // xscvspdpn: non-signaling variant of xscvspdp.
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t bb[16];
++      getVSR128(xb, bb);
++      uint32_t fbits = (uint32_t)bb[12] |
++                       ((uint32_t)bb[13] << 8) |
++                       ((uint32_t)bb[14] << 16) |
++                       ((uint32_t)bb[15] << 24);
++      float fval;
++      memcpy(&fval, &fbits, sizeof(fval));
++      double dval = promoteFloatPreservingNaN(fval);
++      uint64_t dbits;
++      memcpy(&dbits, &dval, sizeof(dbits));
++      uint8_t out[16];
++      memset(out, 0, 8);
++      for (int i = 0; i < 8; i++) out[8 + i] = (uint8_t)(dbits >> (i * 8));
++      setVSR128(xt, out);
++      break;
++    }
++    case 347: {
++      // POWER9 XX2-form ops sharing XO=347; disambiguated by the 5-bit
++      // A immediate (sim bits 20..16):
++      //   A=0  -> xsxexpdp  (extract biased exponent into 11 LSBs of XT.dw0)
++      //   A=16 -> xscvhpdp  (FP16 -> FP64)
++      //   A=17 -> xscvdphp  (FP64 -> FP16)
++      // Half placement: the FP16 value lives at LE bytes 8..9 of
++      // the VSR (= BE bits 48..63 of
++      // dword[0]), with the rest of dword[0] zeroed. This matches the
++      // lxsihzx layout already used by the JIT.
++      uint32_t aImm = (instr->instructionBits() >> 16) & 0x1F;
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t bb[16], out[16];
++      getVSR128(xb, bb);
++      memset(out, 0, 16);
++      if (aImm == 17) {
++        // xscvdphp: read FP64 from BE 0..63 of XB (LE bytes 8..15),
++        // convert to FP16, place at LE bytes 8..9 of XT.
++        double d;
++        memcpy(&d, bb + 8, 8);
++        uint16_t h = js::float16(d).toRawBits();
++        out[8] = (uint8_t)(h & 0xFF);
++        out[9] = (uint8_t)((h >> 8) & 0xFF);
++      } else if (aImm == 16) {
++        // xscvhpdp: read FP16 from LE bytes 8..9 of XB, convert to FP64,
++        // place at LE bytes 8..15 of XT.
++        uint16_t h = (uint16_t)bb[8] | ((uint16_t)bb[9] << 8);
++        double d = static_cast<double>(js::float16::fromRawBits(h));
++        memcpy(out + 8, &d, 8);
++      } else if (aImm == 0) {
++        // xsxexpdp: read FP64 from LE bytes 8..15 of XB, extract biased
++        // exponent (bits 1..11 of the IEEE-754 double = bits 52..62 of
++        // the 64-bit pattern), place into XT.dw0 with rest zeroed.
++        uint64_t bits = 0;
++        for (int i = 0; i < 8; i++) bits |= uint64_t(bb[8 + i]) << (i * 8);
++        uint64_t exp = (bits >> 52) & 0x7FF;
++        for (int i = 0; i < 8; i++) out[8 + i] = (uint8_t)(exp >> (i * 8));
++      } else {
++        MOZ_CRASH_UNSAFE_PRINTF(
++            "decodeVSX XO=347 with unexpected A=%u (instr 0x%08x)",
++            aImm, instr->instructionBits());
++      }
++      setVSR128(xt, out);
++      break;
++    }
++    case 475: {
++      // xxbrd: byte-reverse each doubleword.
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t bb[16], out[16];
++      getVSR128(xb, bb);
++      for (int i = 0; i < 8; i++) out[i] = bb[7 - i];
++      for (int i = 0; i < 8; i++) out[8 + i] = bb[15 - i];
++      setVSR128(xt, out);
++      break;
++    }
++
++    // === XX3-form scalar: xsmaxjdp / xsminjdp (POWER9) ===
++    //
++    // xs{max,min}jdp XT, XA, XB. Scalar inputs at BE bits 0..63 of
++    // XA / XB (= LE bytes 8..15); result lands at BE 0..63 of XT
++    // (upper bits "undefined" per ISA).
++    //
++    // Semantics match ECMA-262 Math.{max,min} / wasm f64.{max,min}:
++    //   - NaN: if A is NaN return A; else if B is NaN return B. sNaN
++    //     payload preserved bit-for-bit (NOT quieted).
++    //   - ±0 tie: signed-zero ordering. xsmaxjdp returns +0 for any
++    //     mix of (-0, +0); xsminjdp returns -0.
++    //   - Otherwise: standard IEEE max / min.
++    case 288: case 289:    // xsmaxjdp  (XO8=144 → 9-bit 288/289)
++    case 304: case 305: {  // xsminjdp  (XO8=152 → 9-bit 304/305)
++      int xa = int(instr->raValue() | (instr->bit(2) << 5));
++      int xb = int(instr->rbValue() | (instr->bit(1) << 5));
++      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t ab[16], bb[16], out[16];
++      getVSR128(xa, ab);
++      getVSR128(xb, bb);
++      double a, b;
++      memcpy(&a, ab + 8, 8);
++      memcpy(&b, bb + 8, 8);
++      bool isMax = (xo >> 1) == 144;
++      double r;
++      if (std::isnan(a)) {
++        r = a;
++      } else if (std::isnan(b)) {
++        r = b;
++      } else if (a == 0.0 && b == 0.0) {
++        // Signed-zero ordering: max picks +0, min picks -0.
++        if (isMax) {
++          r = std::signbit(a) ? b : a;
++        } else {
++          r = std::signbit(a) ? a : b;
++        }
++      } else {
++        r = isMax ? std::max(a, b) : std::min(a, b);
++      }
++      memset(out, 0, 8);
++      memcpy(out + 8, &r, 8);
++      setVSR128(xt, out);
++      break;
++    }
++
++    // --- VSX XX3-form: xxpermdi ---
++    //
++    // xxpermdi XT, XA, XB, DM:
++    //   XT.DW0 = XA.DW(DM[0])
++    //   XT.DW1 = XB.DW(DM[1])
++    // In BE, DW0 is MSB-side, DW1 is LSB-side. On PPC64LE register
++    // storage, DW0 = LE bytes 8-15 and DW1 = LE bytes 0-7. The sim's
++    // previous implementation used the reversed "DW0 = LE 0-7"
++    // convention which cancelled for self-swap round-trips but
++    // produced wrong halves when chained with ISA-correct ops
++    // (mtvsrd, xxspltw, mfvsrd).
++    case 20: case 21:       // xxpermdi DM=0
++    case 84: case 85:       // xxpermdi DM=1
++    case 148: case 149:     // xxpermdi DM=2 (= xxswapd when XA==XB)
++    case 212: case 213: {   // xxpermdi DM=3
++      uint8_t dm_hi = (xo >> 7) & 1;  // DM[0]
++      uint8_t dm_lo = (xo >> 6) & 1;  // DM[1]
++      int xa = int(instr->raValue() | (instr->bit(2) << 5));
++      int xb = int(instr->rbValue() | (instr->bit(1) << 5));
++      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t xa_bytes[16], xb_bytes[16], result[16];
++      getVSR128(xa, xa_bytes);
++      getVSR128(xb, xb_bytes);
++      // DW0 in LE storage is bytes 8-15; DW1 is bytes 0-7.
++      //   XT.DW0 (result[8..15]) = XA.DW(dm_hi)
++      //   XT.DW1 (result[0..7])  = XB.DW(dm_lo)
++      // DW(0) is at LE 8, DW(1) is at LE 0.
++      memcpy(result + 8, xa_bytes + (dm_hi ? 0 : 8), 8);
++      memcpy(result,     xb_bytes + (dm_lo ? 0 : 8), 8);
++      setVSR128(xt, result);
++      break;
++    }
++
++    // --- VSX logical (XX3-form, primary opcode 60) ---
++    //
++    // Each takes two 6-bit VSR sources XA/XB and writes 6-bit VSR
++    // destination XT. 8-bit ISA XO at bits 21-28; our
++    // 9-bit XO extraction (bits 10:2) includes the AX bit at position 0,
++    // so each op appears as two consecutive values (AX=0 and AX=1).
++    //
++    //   xxland XT,XA,XB     XO=130  (9-bit: 260, 261)  XT = XA & XB
++    //   xxlandc XT,XA,XB    XO=138  (276, 277)         XT = XA & ~XB
++    //   xxlor XT,XA,XB      XO=146  (292, 293)         XT = XA | XB
++    //   xxlxor XT,XA,XB     XO=154  (308, 309)         XT = XA ^ XB
++    //   xxlnor XT,XA,XB     XO=162  (324, 325)         XT = ~(XA | XB)
++    //   xxlorc XT,XA,XB     XO=170  (340, 341)         XT = XA | ~XB
++    //   xxlnand XT,XA,XB    XO=178  (356, 357)         XT = ~(XA & XB)
++    //   xxleqv XT,XA,XB     XO=186  (372, 373)         XT = ~(XA ^ XB)
++    //
++    // The encoding constants in Assembler-ppc64.h match: PPC_xxlor=0xF0000490
++    // has bits 4,7,10 set in its base (XO=146 in the 8-bit field), which
++    // under the simulator's 9-bit extraction gives 2*146=292 (AX=0 default).
++    case 260: case 261:  // xxland
++    case 276: case 277:  // xxlandc
++    case 292: case 293:  // xxlor
++    case 308: case 309:  // xxlxor
++    case 324: case 325:  // xxlnor
++    case 340: case 341:  // xxlorc
++    case 356: case 357:  // xxlnand
++    case 372: case 373:  // xxleqv
++    {
++      int xa = int(instr->raValue() | (instr->bit(2) << 5));
++      int xb = int(instr->rbValue() | (instr->bit(1) << 5));
++      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t a_bytes[16], b_bytes[16], result[16];
++      getVSR128(xa, a_bytes);
++      getVSR128(xb, b_bytes);
++      // Dispatch on the 8-bit ISA XO (ignoring AX bit at position 0).
++      uint32_t xo8 = xo >> 1;
++      for (int i = 0; i < 16; i++) {
++        uint8_t a = a_bytes[i], b = b_bytes[i];
++        switch (xo8) {
++          case 130: result[i] = a & b;        break;  // xxland
++          case 138: result[i] = a & ~b;       break;  // xxlandc
++          case 146: result[i] = a | b;        break;  // xxlor
++          case 154: result[i] = a ^ b;        break;  // xxlxor
++          case 162: result[i] = (uint8_t)~(a | b);  break;  // xxlnor
++          case 170: result[i] = a | (uint8_t)~b;    break;  // xxlorc
++          case 178: result[i] = (uint8_t)~(a & b);  break;  // xxlnand
++          case 186: result[i] = (uint8_t)~(a ^ b);  break;  // xxleqv
++        }
++      }
++      setVSR128(xt, result);
++      break;
++    }
++
++    // === XX2-form: xxspltw (splat word from VRB[UIM] to all 4 lanes) ===
++    //
++    // xxspltw: UIM selects one of four words in BE numbering. UIM=0
++    // → BE word 0 (MSB side of the 128 bits). On PPC64LE register
++    // storage that maps to LE word (3 - UIM). With the input
++    // {0x11111111, 0x22222222, 0x33333333, 0x44444444}: UIM=0
++    // splats 0x44444444 (= LE word 3), UIM=3 splats 0x11111111
++    // (= LE word 0). The JIT emits xxspltw UIM=1 after mtvsrd on the
++    // POWER8 splatX4 path — mtvsrd puts the GPR's low 32 bits in BE
++    // word 1 (= LE word 2 on HW), so xxspltw UIM=1 picks up exactly
++    // that word and splats it to every lane.
++    case 164: {  // xxspltw
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint32_t uim = (instr->instructionBits() >> 16) & 0x3;
++      uint32_t leIdx = 3 - uim;  // BE word UIM → LE word (3-UIM)
++      uint8_t bb[16], result[16];
++      getVSR128(xb, bb);
++      uint32_t word = (uint32_t)bb[leIdx * 4] |
++                      ((uint32_t)bb[leIdx * 4 + 1] << 8) |
++                      ((uint32_t)bb[leIdx * 4 + 2] << 16) |
++                      ((uint32_t)bb[leIdx * 4 + 3] << 24);
++      for (int i = 0; i < 4; i++) {
++        result[i * 4]     = (uint8_t)(word & 0xFF);
++        result[i * 4 + 1] = (uint8_t)((word >> 8) & 0xFF);
++        result[i * 4 + 2] = (uint8_t)((word >> 16) & 0xFF);
++        result[i * 4 + 3] = (uint8_t)((word >> 24) & 0xFF);
++      }
++      setVSR128(xt, result);
++      break;
++    }
++
++    // === XX2-form: xxextractuw (extract word at BE byte UIM, place at BE word 1) ===
++    //
++    // xxextractuw XT, XB, UIM:
++    //   Bytes [4:7] of XT receive bytes [UIM:UIM+3] of XB. Bytes [0:3]
++    //   and [8:15] of XT are set to zero.
++    // UIM ∈ {0, 4, 8, 12} (caller responsible for alignment).
++    // BE byte i ↔ LE byte (15-i), so the word at XB BE bytes UIM..UIM+3
++    // sits at XB LE bytes (12-UIM)..(15-UIM), and lands at XT LE bytes
++    // 8..11 (= XT BE word 1).
++    case 165: {  // xxextractuw
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint32_t uim = (instr->instructionBits() >> 16) & 0xF;
++      uint8_t bb[16], result[16];
++      getVSR128(xb, bb);
++      memset(result, 0, sizeof(result));
++      // result.LE[8..11] = XB.LE[(12-UIM)..(15-UIM)] (preserves byte order).
++      memcpy(result + 8, bb + (12 - uim), 4);
++      setVSR128(xt, result);
++      break;
++    }
++
++    case 180: {
++      // xxspltib XT, IMM8 (POWER9, ISA 3.0): splat 8-bit immediate to
++      // all 16 bytes of XT. The encoder writes `imm8 << 11`, so IMM8
++      // occupies LE bits 11..18; TX bit at LE bit 0 selects upper VSR.
++      uint32_t imm8 = instr->bits(18, 11);
++      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++      uint8_t xt_bytes[16];
++      memset(xt_bytes, (uint8_t)imm8, 16);
++      setVSR128(xt, xt_bytes);
++      break;
++    }
++    case 181: {
++      // xxinsertw XT, XB, UIM (POWER9, ISA 3.0): copy XB[32..63] (the
++      // low 32 bits of XB's BE doubleword 0, which lives at LE bytes
++      // 8-11 of XB) into XT at BE byte position UIM. UIM ∈ {0,4,8,12};
++      // dest occupies XT LE bytes (12-UIM)..(15-UIM). Other bytes of
++      // XT are preserved. UIM at PPC bits 11-15 = LE bits 16-20; TX/BX
++      // at LE bits 0/1.
++      uint32_t uim = instr->bits(20, 16);
++      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++      int xb = int(instr->rbValue() | (instr->bit(1) << 5));
++      uint8_t xb_bytes[16], xt_bytes[16];
++      getVSR128(xb, xb_bytes);
++      getVSR128(xt, xt_bytes);
++      memcpy(xt_bytes + (12 - uim), xb_bytes + 8, 4);
++      setVSR128(xt, xt_bytes);
++      break;
++    }
++
++    // === XX2-form: xvabssp / xvabsdp (vector absolute value) ===
++    case 408: case 409: case 410: case 411: {  // xvabssp + AX/BX bits
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t bb[16], result[16];
++      getVSR128(xb, bb);
++      for (int i = 0; i < 4; i++) {
++        uint32_t bits = (uint32_t)bb[i * 4] |
++                        ((uint32_t)bb[i * 4 + 1] << 8) |
++                        ((uint32_t)bb[i * 4 + 2] << 16) |
++                        ((uint32_t)bb[i * 4 + 3] << 24);
++        bits &= 0x7FFFFFFFu;  // clear sign bit
++        result[i * 4]     = (uint8_t)(bits & 0xFF);
++        result[i * 4 + 1] = (uint8_t)((bits >> 8) & 0xFF);
++        result[i * 4 + 2] = (uint8_t)((bits >> 16) & 0xFF);
++        result[i * 4 + 3] = (uint8_t)((bits >> 24) & 0xFF);
++      }
++      setVSR128(xt, result);
++      break;
++    }
++    case 472: case 473: case 474: {            // xvabsdp (475 used by xxbrd)
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t bb[16], result[16];
++      getVSR128(xb, bb);
++      for (int i = 0; i < 2; i++) {
++        uint64_t bits = 0;
++        for (int k = 0; k < 8; k++) bits |= ((uint64_t)bb[i * 8 + k]) << (k * 8);
++        bits &= 0x7FFFFFFFFFFFFFFFULL;
++        for (int k = 0; k < 8; k++) result[i * 8 + k] = (uint8_t)((bits >> (k * 8)) & 0xFF);
++      }
++      setVSR128(xt, result);
++      break;
++    }
++
++    // === XX2-form unary vector float ops (single XB operand, no AX) ===
++    //
++    // Encoding: opcode 60, bits 6-10=XT, 11-15 reserved, 16-20=XB,
++    // 21-29 = 9-bit XO (full field), 30=BX, 31=TX. Extraction gives us
++    // xo = XO9 directly (no AX bit). Every op below has a unique XO9.
++    //
++    //   xvsqrtsp  XO9=139  PPC_xvsqrtsp=0xF000022C
++    //   xvsqrtdp  XO9=203  PPC_xvsqrtdp=0xF000032C
++    //   xvnegsp   XO9=441  PPC_xvnegsp=0xF00006E4
++    //   xvnegdp   XO9=505  PPC_xvnegdp=0xF00007E4
++    //   xvrspip   XO9=169  PPC_xvrspip=0xF00002A4   (round +inf = ceil)
++    //   xvrspiz   XO9=153  PPC_xvrspiz=0xF0000264   (round toward 0 = trunc)
++    //   xvrspim   XO9=185  PPC_xvrspim=0xF00002E4   (round -inf = floor)
++    //   xvrspic   XO9=171  PPC_xvrspic=0xF00002AC   (round per FPSCR)
++    //   xvrdpip   XO9=233  PPC_xvrdpip=0xF00003A4
++    //   xvrdpiz   XO9=217  PPC_xvrdpiz=0xF0000364
++    //   xvrdpim   XO9=249  PPC_xvrdpim=0xF00003E4
++    //   xvrdpic   XO9=235  PPC_xvrdpic=0xF00003AC
++    //   xvcvspsxws XO9=152 PPC_xvcvspsxws=0xF0000260  (f32 → s32, sat)
++    //   xvcvspuxws XO9=136 PPC_xvcvspuxws=0xF0000220  (f32 → u32, sat)
++    //   xvcvsxwsp XO9=184  PPC_xvcvsxwsp=0xF00002E0   (s32 → f32)
++    //   xvcvuxwsp XO9=168  PPC_xvcvuxwsp=0xF00002A0   (u32 → f32)
++    case 139: case 203:     // xvsqrtsp / xvsqrtdp
++    case 441: case 505:     // xvnegsp / xvnegdp
++    case 169: case 233:     // xvrspip / xvrdpip (ceil)
++    case 153: case 217:     // xvrspiz / xvrdpiz (trunc)
++    case 185: case 249:     // xvrspim / xvrdpim (floor)
++    case 171: case 235:     // xvrspic / xvrdpic (round-to-nearest)
++    case 136: case 152:     // xvcvspuxws / xvcvspsxws
++    case 168: case 184: {   // xvcvuxwsp / xvcvsxwsp
++      int xt = int(rt | (instr->bit(0) << 5));
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t bb[16], result[16];
++      getVSR128(xb, bb);
++      bool isSp = (xo == 139 || xo == 441 || xo == 169 || xo == 153 ||
++                   xo == 185 || xo == 171 || xo == 136 || xo == 152 ||
++                   xo == 168 || xo == 184);
++      auto getF32 = [](uint8_t* buf, int i) -> float {
++        uint32_t b = (uint32_t)buf[i * 4] |
++                     ((uint32_t)buf[i * 4 + 1] << 8) |
++                     ((uint32_t)buf[i * 4 + 2] << 16) |
++                     ((uint32_t)buf[i * 4 + 3] << 24);
++        float f; memcpy(&f, &b, sizeof(f)); return f;
++      };
++      auto setF32 = [](uint8_t* buf, int i, float f) {
++        uint32_t b; memcpy(&b, &f, sizeof(b));
++        buf[i*4]=(uint8_t)b; buf[i*4+1]=(uint8_t)(b>>8);
++        buf[i*4+2]=(uint8_t)(b>>16); buf[i*4+3]=(uint8_t)(b>>24);
++      };
++      auto getF64 = [](uint8_t* buf, int i) -> double {
++        uint64_t b = 0;
++        for (int k=0;k<8;k++) b |= ((uint64_t)buf[i*8+k])<<(k*8);
++        double d; memcpy(&d, &b, sizeof(d)); return d;
++      };
++      auto setF64 = [](uint8_t* buf, int i, double d) {
++        uint64_t b; memcpy(&b, &d, sizeof(b));
++        for (int k=0;k<8;k++) buf[i*8+k]=(uint8_t)(b>>(k*8));
++      };
++      // Integer lane read/write (used by conversion ops).
++      auto setU32 = [](uint8_t* buf, int i, uint32_t v) {
++        buf[i*4]=(uint8_t)v; buf[i*4+1]=(uint8_t)(v>>8);
++        buf[i*4+2]=(uint8_t)(v>>16); buf[i*4+3]=(uint8_t)(v>>24);
++      };
++      // Saturated float→int conversion per Power ISA v3.0B: input NaN maps
++      // to 0; out-of-range saturates to the extreme of the destination type.
++      auto fp2sxw = [](double f) -> uint32_t {
++        if (std::isnan(f)) return 0;
++        if (f >= (double)INT32_MAX) return (uint32_t)INT32_MAX;
++        if (f <= (double)INT32_MIN) return (uint32_t)INT32_MIN;
++        return (uint32_t)(int32_t)std::trunc(f);
++      };
++      auto fp2uxw = [](double f) -> uint32_t {
++        if (std::isnan(f)) return 0;
++        if (f >= (double)UINT32_MAX) return UINT32_MAX;
++        if (f <= 0.0) return 0;
++        return (uint32_t)std::trunc(f);
++      };
++
++      if (isSp) {
++        for (int i = 0; i < 4; i++) {
++          float v = getF32(bb, i);
++          float out = 0.0f;
++          uint32_t iout = 0;
++          bool isInt = false;
++          switch (xo) {
++            case 139: out = std::sqrt(v); break;                // xvsqrtsp
++            case 441: out = -v; break;                          // xvnegsp
++            case 169: out = std::ceil(v); break;                // xvrspip
++            case 153: out = std::trunc(v); break;               // xvrspiz
++            case 185: out = std::floor(v); break;               // xvrspim
++            case 171: out = std::nearbyint(v); break;           // xvrspic
++            case 152: iout = fp2sxw(v); isInt = true; break;    // xvcvspsxws
++            case 136: iout = fp2uxw(v); isInt = true; break;    // xvcvspuxws
++            case 184: {                                          // xvcvsxwsp
++              uint32_t bits = (uint32_t)bb[i*4] |
++                              ((uint32_t)bb[i*4+1]<<8) |
++                              ((uint32_t)bb[i*4+2]<<16) |
++                              ((uint32_t)bb[i*4+3]<<24);
++              out = (float)(int32_t)bits;
++              break;
++            }
++            case 168: {                                          // xvcvuxwsp
++              uint32_t bits = (uint32_t)bb[i*4] |
++                              ((uint32_t)bb[i*4+1]<<8) |
++                              ((uint32_t)bb[i*4+2]<<16) |
++                              ((uint32_t)bb[i*4+3]<<24);
++              out = (float)(uint32_t)bits;
++              break;
++            }
++          }
++          if (isInt) setU32(result, i, iout);
++          else setF32(result, i, out);
++        }
++      } else {
++        for (int i = 0; i < 2; i++) {
++          double v = getF64(bb, i);
++          double out = 0.0;
++          switch (xo) {
++            case 203: out = std::sqrt(v); break;                // xvsqrtdp
++            case 505: out = -v; break;                          // xvnegdp
++            case 233: out = std::ceil(v); break;                // xvrdpip
++            case 217: out = std::trunc(v); break;               // xvrdpiz
++            case 249: out = std::floor(v); break;               // xvrdpim
++            case 235: out = std::nearbyint(v); break;           // xvrdpic
++          }
++          setF64(result, i, out);
++        }
++      }
++      setVSR128(xt, result);
++      break;
++    }
++
++    // === XX3-form vector float compare (eq, gt, ge) ===
++    // The wasm SIMD compares emit these and use the result as a bitmask.
++    // Per Power ISA: result is all-1s for true lanes, all-0s for false
++    // (for the non-recording form; bit 0 of XO selects record form which
++    // we don't model — wasm doesn't read CR6 here).
++    // Encodings:
++    //   0xF0000218 xvcmpeqsp (XO8=67) → XO9 = 134/135 (+AX).
++    //   0xF0000258 xvcmpgtsp (XO8=75) → XO9 = 150/151.
++    //   0xF0000298 xvcmpgesp (XO8=83) → XO9 = 166/167.
++    //   0xF0000318 xvcmpeqdp (XO8=99) → XO9 = 198/199.
++    //   0xF0000358 xvcmpgtdp (XO8=107) → XO9 = 214/215.
++    //   0xF0000398 xvcmpgedp (XO8=115) → XO9 = 230/231.
++    // Rc=1 record form flips ISA bit 21 (sim bit 10), yielding XO9+256
++    // (not adjacent to the Rc=0 slot). wasm never emits the record form.
++    case 134: case 135:    // xvcmpeqsp (XO8=67)
++    case 198: case 199:    // xvcmpeqdp (XO8=99)
++    case 150: case 151:    // xvcmpgtsp (XO8=75)
++    case 214: case 215:    // xvcmpgtdp (XO8=107)
++    case 166: case 167:    // xvcmpgesp (XO8=83)
++    case 230: case 231: {  // xvcmpgedp (XO8=115)
++      int xt = int(rt | (instr->bit(0) << 5));
++      uint32_t ra = instr->raValue();
++      int xa = int(ra | ((instr->instructionBits() >> 2) & 1) << 5);
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t ab[16], bb[16], result[16];
++      getVSR128(xa, ab);
++      getVSR128(xb, bb);
++      uint32_t op8 = xo >> 1;  // canonical 8-bit XO
++      bool isF32 = (op8 == 67 || op8 == 75 || op8 == 83);
++      bool isEq  = (op8 == 67 || op8 == 99);
++      bool isGt  = (op8 == 75 || op8 == 107);
++      bool isGe  = (op8 == 83 || op8 == 115);
++      (void)isGe;
++      auto cmpF32 = [&](int i) -> bool {
++        uint32_t aBits = (uint32_t)ab[i * 4] |
++                         ((uint32_t)ab[i * 4 + 1] << 8) |
++                         ((uint32_t)ab[i * 4 + 2] << 16) |
++                         ((uint32_t)ab[i * 4 + 3] << 24);
++        uint32_t bBits = (uint32_t)bb[i * 4] |
++                         ((uint32_t)bb[i * 4 + 1] << 8) |
++                         ((uint32_t)bb[i * 4 + 2] << 16) |
++                         ((uint32_t)bb[i * 4 + 3] << 24);
++        float fa, fb;
++        memcpy(&fa, &aBits, sizeof(fa));
++        memcpy(&fb, &bBits, sizeof(fb));
++        if (isEq) return fa == fb;
++        if (isGt) return fa > fb;
++        return fa >= fb;
++      };
++      auto cmpF64 = [&](int i) -> bool {
++        uint64_t aBits = 0, bBits = 0;
++        for (int k = 0; k < 8; k++) aBits |= ((uint64_t)ab[i * 8 + k]) << (k * 8);
++        for (int k = 0; k < 8; k++) bBits |= ((uint64_t)bb[i * 8 + k]) << (k * 8);
++        double fa, fb;
++        memcpy(&fa, &aBits, sizeof(fa));
++        memcpy(&fb, &bBits, sizeof(fb));
++        if (isEq) return fa == fb;
++        if (isGt) return fa > fb;
++        return fa >= fb;
++      };
++      if (isF32) {
++        for (int i = 0; i < 4; i++) {
++          uint32_t mask = cmpF32(i) ? 0xFFFFFFFFu : 0;
++          for (int k = 0; k < 4; k++) {
++            result[i * 4 + k] = (uint8_t)((mask >> (k * 8)) & 0xFF);
++          }
++        }
++      } else {
++        for (int i = 0; i < 2; i++) {
++          uint64_t mask = cmpF64(i) ? UINT64_MAX : 0;
++          for (int k = 0; k < 8; k++) {
++            result[i * 8 + k] = (uint8_t)((mask >> (k * 8)) & 0xFF);
++          }
++        }
++      }
++      setVSR128(xt, result);
++      break;
++    }
++
++    // === XX3-form vector float arithmetic ===
++    // Encoding: bits 6-10=XT, 11-15=XA, 16-20=XB, 21-28=XO (8 bits), 29=AX,
++    // 30=BX, 31=TX. We dispatched above using `bits(10, 2)` which is bits
++    // 21-29 (9 bits) — that includes the AX register-extension bit, which
++    // changes for every XA in {0..31} vs {32..63}. To match all 4
++    // (AX,BX) combinations of an XX3 op we use `case xo3 | 0|1|2|3` where
++    // xo3 = (8-bit XO) << 1 (because XO occupies bits 1..8 of our 9-bit
++    // extraction). Helper macro: each case covers four labels.
++    #define XX3_CASE_BASE(name) \
++      case ((name) | 0): case ((name) | 1):
++    case 128:  case 129:  // xvaddsp: 4 × f32 add (XO=64 → bits 1..8 = 128)
++    case 192:  case 193:  // xvadddp
++    case 144:  case 145:  // xvsubsp
++    case 208:  case 209:  // xvsubdp
++    case 160:  case 161:  // xvmulsp
++    case 224:  case 225:  // xvmuldp
++    case 176:  case 177:  // xvdivsp
++    case 240:  case 241:  // xvdivdp
++    case 384:  case 385:  // xvmaxsp
++    case 448:  case 449:  // xvmaxdp
++    case 400:  case 401:  // xvminsp
++    case 464:  case 465:  // xvmindp
++    {
++      // Re-extract the canonical 8-bit XX3 XO.
++      uint32_t xo3 = (xo >> 1);
++      (void)xo3;
++      int xt = int(rt | (instr->bit(0) << 5));
++      uint32_t ra = instr->raValue();
++      int xa = int(ra | ((instr->instructionBits() >> 2) & 1) << 5);
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t ab[16], bb[16], rb_bytes[16];
++      getVSR128(xa, ab);
++      getVSR128(xb, bb);
++
++      auto getF32 = [](uint8_t* buf, int i) -> float {
++        uint32_t bits = (uint32_t)buf[i * 4] |
++                        ((uint32_t)buf[i * 4 + 1] << 8) |
++                        ((uint32_t)buf[i * 4 + 2] << 16) |
++                        ((uint32_t)buf[i * 4 + 3] << 24);
++        float f;
++        memcpy(&f, &bits, sizeof(f));
++        return f;
++      };
++      auto setF32 = [](uint8_t* buf, int i, float f) {
++        uint32_t bits;
++        memcpy(&bits, &f, sizeof(bits));
++        buf[i * 4]     = (uint8_t)(bits & 0xFF);
++        buf[i * 4 + 1] = (uint8_t)((bits >> 8) & 0xFF);
++        buf[i * 4 + 2] = (uint8_t)((bits >> 16) & 0xFF);
++        buf[i * 4 + 3] = (uint8_t)((bits >> 24) & 0xFF);
++      };
++      auto getF64 = [](uint8_t* buf, int i) -> double {
++        uint64_t bits = 0;
++        for (int k = 0; k < 8; k++) bits |= ((uint64_t)buf[i * 8 + k]) << (k * 8);
++        double d;
++        memcpy(&d, &bits, sizeof(d));
++        return d;
++      };
++      auto setF64 = [](uint8_t* buf, int i, double d) {
++        uint64_t bits;
++        memcpy(&bits, &d, sizeof(bits));
++        for (int k = 0; k < 8; k++) buf[i * 8 + k] = (uint8_t)((bits >> (k * 8)) & 0xFF);
++      };
++
++      // Dispatch on the canonical 8-bit XX3 XO (bits 21..28 PPC = xo>>1).
++      switch (xo3) {
++        case 64:  for (int i = 0; i < 4; i++) setF32(rb_bytes, i, getF32(ab, i) + getF32(bb, i)); break;  // xvaddsp
++        case 96:  for (int i = 0; i < 2; i++) setF64(rb_bytes, i, getF64(ab, i) + getF64(bb, i)); break;  // xvadddp
++        case 72:  for (int i = 0; i < 4; i++) setF32(rb_bytes, i, getF32(ab, i) - getF32(bb, i)); break;  // xvsubsp
++        case 104: for (int i = 0; i < 2; i++) setF64(rb_bytes, i, getF64(ab, i) - getF64(bb, i)); break;  // xvsubdp
++        case 80:  for (int i = 0; i < 4; i++) setF32(rb_bytes, i, getF32(ab, i) * getF32(bb, i)); break;  // xvmulsp
++        case 112: for (int i = 0; i < 2; i++) setF64(rb_bytes, i, getF64(ab, i) * getF64(bb, i)); break;  // xvmuldp
++        case 88:  for (int i = 0; i < 4; i++) setF32(rb_bytes, i, getF32(ab, i) / getF32(bb, i)); break;  // xvdivsp
++        case 120: for (int i = 0; i < 2; i++) setF64(rb_bytes, i, getF64(ab, i) / getF64(bb, i)); break;  // xvdivdp
++        // xvmin{sp,dp} / xvmax{sp,dp}:
++        //   If both operands are NaN, result is the NaN from XA.
++        //   If exactly one operand is NaN, result is the NON-NaN operand.
++        //   For 0 / -0, treat -0 < +0 (signed-zero ordering): xvminsp(+0,-0)
++        //   = -0, xvmaxsp(+0,-0) = +0, in either operand order.
++        //   Otherwise, result is IEEE min/max(a, b).
++        // This differs from IEEE 754 (which propagates NaN) and is
++        // relied upon by wasm relaxed_min/max (bug1946618.js) and by
++        // wasm f32x4.min(0,-0) returning -0 (simd_f32x4.wast.js).
++        #define XV_MAX(T, a, b) [](T a_, T b_) -> T {                          \
++          bool an = std::isnan(a_), bn = std::isnan(b_);                       \
++          if (an && bn) return a_;                                              \
++          if (an) return b_;                                                    \
++          if (bn) return a_;                                                    \
++          if (a_ == 0.0 && b_ == 0.0) {                                         \
++            /* -0 is smaller than +0; max picks +0. */                          \
++            return std::signbit(a_) ? b_ : a_;                                  \
++          }                                                                     \
++          return std::max(a_, b_);                                              \
++        }(a, b)
++        #define XV_MIN(T, a, b) [](T a_, T b_) -> T {                          \
++          bool an = std::isnan(a_), bn = std::isnan(b_);                       \
++          if (an && bn) return a_;                                              \
++          if (an) return b_;                                                    \
++          if (bn) return a_;                                                    \
++          if (a_ == 0.0 && b_ == 0.0) {                                         \
++            /* -0 is smaller than +0; min picks -0. */                          \
++            return std::signbit(a_) ? a_ : b_;                                  \
++          }                                                                     \
++          return std::min(a_, b_);                                              \
++        }(a, b)
++        case 192: for (int i = 0; i < 4; i++) {  // xvmaxsp
++          float a = getF32(ab, i), b = getF32(bb, i);
++          setF32(rb_bytes, i, XV_MAX(float, a, b));
++        } break;
++        case 224: for (int i = 0; i < 2; i++) {  // xvmaxdp
++          double a = getF64(ab, i), b = getF64(bb, i);
++          setF64(rb_bytes, i, XV_MAX(double, a, b));
++        } break;
++        case 200: for (int i = 0; i < 4; i++) {  // xvminsp
++          float a = getF32(ab, i), b = getF32(bb, i);
++          setF32(rb_bytes, i, XV_MIN(float, a, b));
++        } break;
++        case 232: for (int i = 0; i < 2; i++) {  // xvmindp
++          double a = getF64(ab, i), b = getF64(bb, i);
++          setF64(rb_bytes, i, XV_MIN(double, a, b));
++        } break;
++        #undef XV_MAX
++        #undef XV_MIN
++        default:
++          MOZ_CRASH_UNSAFE_PRINTF(
++              "xv float dispatch missing 8-bit XO=%u (instr 0x%08x)",
++              xo3, instr->instructionBits());
++      }
++      setVSR128(xt, rb_bytes);
++      break;
++    }
++
++    // === XX3-form fused multiply-add (3-source: XT is also input) ===
++    //
++    //   xvmaddasp XT,XA,XB:  XT = (XA * XB) + XT       (fused madd)
++    //   xvmaddadp XT,XA,XB:  same for f64
++    //   xvnmsubasp XT,XA,XB: XT = -((XA * XB) - XT) = XT - (XA * XB)
++    //   xvnmsubadp XT,XA,XB: same for f64
++    //
++    // Encodings (each +AX): XO8 → XO9 pairs
++    //   xvmaddasp  PPC_xvmaddasp=0xF0000208   XO8=65  → XO9 130/131
++    //   xvmaddadp  PPC_xvmaddadp=0xF0000308   XO8=97  → XO9 194/195
++    //   xvnmsubasp PPC_xvnmsubasp=0xF0000688  XO8=209 → XO9 418/419
++    //   xvnmsubadp PPC_xvnmsubadp=0xF0000788  XO8=241 → XO9 482/483
++    // std::fma gives IEEE-correct single-rounding behaviour matching the
++    // Power ISA definition of these fused forms.
++    case 130: case 131:      // xvmaddasp
++    case 194: case 195:      // xvmaddadp
++    case 418: case 419:      // xvnmsubasp
++    case 482: case 483: {    // xvnmsubadp
++      int xt = int(rt | (instr->bit(0) << 5));
++      uint32_t ra = instr->raValue();
++      int xa = int(ra | ((instr->instructionBits() >> 2) & 1) << 5);
++      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++      uint8_t ab[16], bb[16], tb[16];
++      getVSR128(xa, ab);
++      getVSR128(xb, bb);
++      getVSR128(xt, tb);  // XT is also an input (accumulator).
++      bool isSp = (xo == 130 || xo == 131 || xo == 418 || xo == 419);
++      bool isNmsub = (xo == 418 || xo == 419 || xo == 482 || xo == 483);
++      auto rdF32 = [](uint8_t* buf, int i) -> float {
++        uint32_t b = (uint32_t)buf[i * 4] |
++                     ((uint32_t)buf[i * 4 + 1] << 8) |
++                     ((uint32_t)buf[i * 4 + 2] << 16) |
++                     ((uint32_t)buf[i * 4 + 3] << 24);
++        float f; memcpy(&f, &b, sizeof(f)); return f;
++      };
++      auto wrF32 = [](uint8_t* buf, int i, float f) {
++        uint32_t b; memcpy(&b, &f, sizeof(b));
++        buf[i*4]=(uint8_t)b; buf[i*4+1]=(uint8_t)(b>>8);
++        buf[i*4+2]=(uint8_t)(b>>16); buf[i*4+3]=(uint8_t)(b>>24);
++      };
++      auto rdF64 = [](uint8_t* buf, int i) -> double {
++        uint64_t b = 0;
++        for (int k=0;k<8;k++) b |= ((uint64_t)buf[i*8+k])<<(k*8);
++        double d; memcpy(&d, &b, sizeof(d)); return d;
++      };
++      auto wrF64 = [](uint8_t* buf, int i, double d) {
++        uint64_t b; memcpy(&b, &d, sizeof(b));
++        for (int k=0;k<8;k++) buf[i*8+k]=(uint8_t)(b>>(k*8));
++      };
++      uint8_t result[16];
++      if (isSp) {
++        for (int i = 0; i < 4; i++) {
++          float a = rdF32(ab, i), b = rdF32(bb, i), t = rdF32(tb, i);
++          // madd:  t + a*b ;  nmsub: -(a*b - t) = t - a*b = std::fma(a,b,-t) negated.
++          float out = isNmsub ? -std::fma(a, b, -t)
++                              :  std::fma(a, b, t);
++          wrF32(result, i, out);
++        }
++      } else {
++        for (int i = 0; i < 2; i++) {
++          double a = rdF64(ab, i), b = rdF64(bb, i), t = rdF64(tb, i);
++          double out = isNmsub ? -std::fma(a, b, -t)
++                               :  std::fma(a, b, t);
++          wrF64(result, i, out);
++        }
++      }
++      setVSR128(xt, result);
++      break;
++    }
++
++    default:
++      MOZ_CRASH_UNSAFE_PRINTF(
++          "decodeVSX: unimplemented XO=%u (instruction 0x%08x)", xo,
++          instr->instructionBits());
++  }
++}
++
++// =============================================================================
++// Power ISA v3.1 prefixed instructions (POWER10).
++// =============================================================================
++//
++// A prefixed instruction is 8 bytes: a 4-byte prefix word (primary opcode 1)
++// followed by a 4-byte suffix word. Prefix and suffix must lie in the same
++// 64-byte aligned block — the JIT must guarantee this when emitting; the sim
++// asserts.
++//
++// Prefix word layout (BE bit numbering):
++//   [0..5]   primary opcode = 1
++//   [6..7]   Type (00 = 8LS, 10 = MLS — only forms we implement)
++//   [8..10]  reserved (must be 0)
++//   [11]     R (1 = PC-relative; RA must be 0)
++//   [12..13] reserved (must be 0)
++//   [14..31] d0 (high 18 bits of the 34-bit signed immediate)
++//
++// Suffix word (MLS/8LS form, GPR-target instructions like paddi/pld):
++//   [0..5]   suffix primary opcode (selects the actual instruction)
++//   [6..10]  RT (or RS for stores)
++//   [11..15] RA
++//   [16..31] d1 (low 16 bits of immediate)
++//
++// Suffix word (8LS plxv quirk): the suffix opcode field is only 5 bits
++// wide and bit [5] holds TX, the high bit of the 6-bit XT VSR number:
++//   [0..4]   plxv suffix opcode = 11001 (= 25)
++//   [5]      TX
++//   [6..10]  T
++//   [11..15] RA
++//   [16..31] d1
++// Combined: XT = (TX << 5) | T. (Equivalent: full 6-bit field at [0..5]
++// is 0b11001(TX) — values 50 or 51 in our LE bits 31..26.)
++//
++// Combined immediate: SI = sign_extend((d0 << 16) | d1, 34).
++// EA when R=1: address-of-prefix + SI. (RA must be 0.)
++// EA when R=0: (RA == 0 ? 0 : GPR[RA]) + SI.
++//
++// Suffix opcodes implemented here:
++//   MLS (Type 2) / suffix=14  paddi
++//   MLS (Type 2) / suffix=48  plfs   (load FP single, widens to double)
++//   MLS (Type 2) / suffix=50  plfd   (load FP double)
++//   8LS (Type 0) / suffix=57  pld
++//   8LS (Type 0) / 5-bit suffix=25, bit 26 = TX  plxv
++//
++// Verification recipe when adding more: assemble with `gcc -mcpu=power10
++// -c` (or clang) and compare the emitted bytes against the encoder; encode
++// in a small inline-asm program and step through under this sim.
++
++void Simulator::decodePrefixed(SimInstruction* prefix) {
++  // Prefix and suffix must reside in the same 64-byte block.
++  uint64_t prefixAddr = reinterpret_cast<uint64_t>(prefix);
++  MOZ_ASSERT((prefixAddr & 63) <= 56,
++             "POWER10 prefixed instruction crosses 64-byte boundary");
++
++  SimInstruction* suffix = reinterpret_cast<SimInstruction*>(
++      reinterpret_cast<uint8_t*>(prefix) + SimInstruction::kInstrSize);
++
++  uint32_t type = prefix->bits(25, 24);
++  uint32_t R = prefix->bit(20);
++  uint32_t d0 = prefix->bits(17, 0);  // 18 bits
++  uint32_t suffixOp6 = suffix->bits(31, 26);  // 6-bit form (paddi, pld)
++  uint32_t suffixOp5 = suffix->bits(31, 27);  // 5-bit form (plxv)
++  uint32_t plxvTX = suffix->bit(26);
++  uint32_t rt = suffix->rtValue();
++  uint32_t ra = suffix->raValue();
++  uint32_t d1 = suffix->uimm16Value();
++
++  // Reassemble 34-bit signed displacement.
++  int64_t imm34 = (static_cast<int64_t>(d0) << 16) | d1;
++  imm34 = (imm34 << 30) >> 30;  // sign-extend from bit 33
++
++  // R=1 forms require RA=0 per the ISA.
++  MOZ_ASSERT(!R || ra == 0,
++             "POWER10 prefixed R=1 form requires RA=0");
++
++  // Type 2 = MLS, Type 0 = 8LS. Other types are reserved here.
++  if (type == 2 && suffixOp6 == 14) {
++    // paddi RT, RA, SI, R (MLS)
++    int64_t base = R ? static_cast<int64_t>(prefixAddr)
++                     : (ra == 0 ? 0 : getRegister(ra));
++    setRegister(rt, base + imm34);
++  } else if (type == 0 && suffixOp6 == 57) {
++    // pld RT, D(RA), R (8LS)
++    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++                    : (ra == 0 ? 0 : getRegister(ra)) +
++                          static_cast<uint64_t>(imm34);
++    if (!handleWasmSegFault(ea, 8)) {
++      setRegister(rt, readDW(ea, prefix));
++    }
++  } else if (type == 2 && suffixOp6 == 50) {
++    // plfd FRT, D(RA), R (MLS) — load 8-byte double into FPR.
++    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++                    : (ra == 0 ? 0 : getRegister(ra)) +
++                          static_cast<uint64_t>(imm34);
++    if (!handleWasmSegFault(ea, 8)) {
++      setFpuRegisterDouble(rt, readD(ea, prefix));
++    }
++  } else if (type == 2 && suffixOp6 == 48) {
++    // plfs FRT, D(RA), R (MLS) — load 4-byte single, widen NaN-preserving.
++    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++                    : (ra == 0 ? 0 : getRegister(ra)) +
++                          static_cast<uint64_t>(imm34);
++    if (!handleWasmSegFault(ea, 4)) {
++      float val = *reinterpret_cast<float*>(ea);
++      setFpuRegisterDouble(rt, promoteFloatPreservingNaN(val));
++    }
++  } else if (type == 0 && suffixOp5 == 25) {
++    // plxv XT, D(RA), R (8LS) — XT = (TX << 5) | T, TX at suffix bit 26.
++    int xt = static_cast<int>(rt | (plxvTX << 5));
++    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++                    : (ra == 0 ? 0 : getRegister(ra)) +
++                          static_cast<uint64_t>(imm34);
++    if (!handleWasmSegFault(ea, 16)) {
++      uint8_t buf[16];
++      memcpy(buf, reinterpret_cast<const void*>(ea), 16);
++      setVSR128(xt, buf);
++    }
++  } else if (type == 0 && suffixOp6 == 61) {
++    // pstd RS, D(RA), R (8LS) — store doubleword.
++    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++                    : (ra == 0 ? 0 : getRegister(ra)) +
++                          static_cast<uint64_t>(imm34);
++    if (!handleWasmSegFault(ea, 8)) {
++      writeDW(ea, getRegister(rt), prefix);
++    }
++  } else if (type == 2 && suffixOp6 == 54) {
++    // pstfd FRS, D(RA), R (MLS) — store double.
++    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++                    : (ra == 0 ? 0 : getRegister(ra)) +
++                          static_cast<uint64_t>(imm34);
++    if (!handleWasmSegFault(ea, 8)) {
++      writeD(ea, getFpuRegisterDouble(rt), prefix);
++    }
++  } else if (type == 2 && suffixOp6 == 52) {
++    // pstfs FRS, D(RA), R (MLS) — store single (narrow from double in FPR).
++    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++                    : (ra == 0 ? 0 : getRegister(ra)) +
++                          static_cast<uint64_t>(imm34);
++    if (!handleWasmSegFault(ea, 4)) {
++      double dval = getFpuRegisterDouble(rt);
++      *reinterpret_cast<float*>(ea) = demoteDoublePreservingNaN(dval);
++    }
++  } else if (type == 0 && suffixOp5 == 27) {
++    // pstxv XS, D(RA), R (8LS) — XS = (SX << 5) | S, SX at suffix bit 26.
++    int xs = static_cast<int>(rt | (plxvTX << 5));
++    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++                    : (ra == 0 ? 0 : getRegister(ra)) +
++                          static_cast<uint64_t>(imm34);
++    if (!handleWasmSegFault(ea, 16)) {
++      uint8_t buf[16];
++      getVSR128(xs, buf);
++      memcpy(reinterpret_cast<void*>(ea), buf, 16);
++    }
++  } else {
++    MOZ_CRASH_UNSAFE_PRINTF(
++        "decodePrefixed: unimplemented type=%u "
++        "(prefix 0x%08x, suffix 0x%08x)",
++        type, prefix->instructionBits(), suffix->instructionBits());
++  }
++
++  // Advance past the full 8-byte prefixed instruction unless a handler
++  // already redirected the PC. The caller (instructionDecode) returns
++  // immediately after us, so its 4-byte trailing advance is skipped.
++  if (!pc_modified_) {
++    set_pc(static_cast<int64_t>(prefixAddr) + 2 * SimInstruction::kInstrSize);
++  }
++}
++
++// =============================================================================
++// Top-level instruction decoder.
++// =============================================================================
++
++void Simulator::instructionDecode(SimInstruction* instr) {
++  if (!SimulatorProcess::ICacheCheckingDisableCount) {
++    AutoLockSimulatorCache als;
++    SimulatorProcess::checkICacheLocked(instr);
++  }
++  pc_modified_ = false;
++
++  uint32_t instrBits = instr->instructionBits();
++
++  // Check for kCallRedirInstr first (PPC_stop = 0x4C0002E4).
++  if (instrBits == kCallRedirInstr) {
++    softwareInterrupt(instr);
++    if (!pc_modified_) {
++      set_pc(reinterpret_cast<int64_t>(instr) + SimInstruction::kInstrSize);
++    }
++    return;
++  }
++
++  // Check for PPC_trap (0x7FE00008).
++  if (instrBits == 0x7FE00008) {
++    softwareInterrupt(instr);
++    if (!pc_modified_) {
++      set_pc(reinterpret_cast<int64_t>(instr) + SimInstruction::kInstrSize);
++    }
++    return;
++  }
++
++  uint32_t opcode = instr->opcode();
++
++  // Power ISA v3.1 prefixed instructions: primary opcode 1 marks a
++  // 4-byte prefix word followed by a 4-byte suffix word. decodePrefixed
++  // advances the PC by the full 8 bytes (or leaves it modified for
++  // PC-relative side-effects).
++  if (opcode == 1) {
++    decodePrefixed(instr);
++    return;
++  }
++
++  switch (opcode) {
++    // D-form ALU
++    case 3:   // twi
++    case 7:   // mulli
++    case 8:   // subfic
++    case 10:  // cmpli
++    case 11:  // cmpi
++    case 12:  // addic
++    case 13:  // addic.
++    case 14:  // addi
++    case 15:  // addis
++    case 24:  // ori
++    case 25:  // oris
++    case 26:  // xori
++    case 27:  // xoris
++    case 28:  // andi.
++    case 29:  // andis.
++      decodeDFormALU(instr);
++      break;
++
++    // D-form loads
++    case 32:  // lwz
++    case 33:  // lwzu
++    case 34:  // lbz
++    case 35:  // lbzu
++    case 40:  // lhz
++    case 41:  // lhzu
++    case 42:  // lha
++    case 43:  // lhau
++    case 48:  // lfs
++    case 49:  // lfsu
++    case 50:  // lfd
++    case 51:  // lfdu
++      decodeDFormLoad(instr);
++      break;
++
++    // D-form stores
++    case 36:  // stw
++    case 38:  // stb
++    case 39:  // stbu
++    case 44:  // sth
++    case 45:  // sthu
++    case 52:  // stfs
++    case 53:  // stfsu
++    case 54:  // stfd
++    case 55:  // stfdu
++      decodeDFormStore(instr);
++      break;
++
++    // DS-form
++    case 58:  // ld, ldu, lwa
++    case 62:  // std, stdu
++      decodeDSForm(instr);
++      break;
++
++    // B-form conditional branch
++    case 16:
++      decodeBranch(instr);
++      break;
++
++    // SC (system call) - unused in JIT
++    case 17:
++      MOZ_CRASH("Simulator: sc instruction not supported");
++      break;
++
++    // I-form unconditional branch
++    case 18:
++      decodeBranch(instr);
++      break;
++
++    // XL-form (branch to LR/CTR, CR operations)
++    case 19:
++      decodeBranch(instr);
++      break;
++
++    // M-form / MD-form rotate/mask
++    case 20:  // rlwimi
++    case 21:  // rlwinm
++    case 23:  // rlwnm
++    case 30:  // rldicl, rldicr, rldic, rldimi, rldcl, rldcr
++      decodeRotateMask(instr);
++      break;
++
++    // VMX (AltiVec) — primary opcode 4. Vector arithmetic / compare / shift /
++    // splat / merge / pack / unpack on VR0-VR31. The wasm SIMD lowering
++    // emits these directly (Simd128 lives in the VR namespace).
++    case 4:
++      decodeVMX(instr);
++      break;
++
++    // X-form / XO-form
++    case 31:
++      decodeXForm(instr);
++      break;
++
++    // FP single (A-form)
++    case 59:
++      decodeFP(instr);
++      break;
++
++    // VSX (XX1-form)
++    case 60:
++      decodeVSX(instr);
++      break;
++
++    // FP double (X-form / A-form)
++    case 63:
++      decodeFP(instr);
++      break;
++
++    default:
++      MOZ_CRASH_UNSAFE_PRINTF(
++          "instructionDecode: unsupported opcode %u (instruction 0x%08x)",
++          opcode, instrBits);
++  }
++
++  if (!pc_modified_) {
++    set_pc(reinterpret_cast<int64_t>(instr) + SimInstruction::kInstrSize);
++  }
++}
++
++// =============================================================================
++// Single-stepping / execute loop.
++// =============================================================================
++
++void Simulator::enable_single_stepping(SingleStepCallback cb, void* arg) {
++  single_stepping_ = true;
++  single_step_callback_ = cb;
++  single_step_callback_arg_ = arg;
++  single_step_callback_(single_step_callback_arg_, this, (void*)get_pc());
++}
++
++void Simulator::disable_single_stepping() {
++  if (!single_stepping_) {
++    return;
++  }
++  single_step_callback_(single_step_callback_arg_, this, (void*)get_pc());
++  single_stepping_ = false;
++  single_step_callback_ = nullptr;
++  single_step_callback_arg_ = nullptr;
++}
++
++template <bool enableStopSimAt>
++void Simulator::execute() {
++  if (single_stepping_ && getenv("PPC64_TRACE_SIM")) {
++    fprintf(stderr, "[sim] enter execute pc=0x%lx lr=0x%lx fp=0x%lx sp=0x%lx\n",
++            (long)get_pc(), (long)getLR(), (long)getRegister(fp),
++            (long)getRegister(sp));
++  }
++  if (single_stepping_) {
++    single_step_callback_(single_step_callback_arg_, this, nullptr);
++  }
++
++  int64_t program_counter = get_pc();
++
++  while (program_counter != end_sim_pc) {
++    if (enableStopSimAt && (icount_ == Simulator::StopSimAt)) {
++      ppc64Debugger dbg(this);
++      dbg.debug();
++    } else {
++      if (single_stepping_) {
++        if (getenv("PPC64_TRACE_SIM")) {
++          fprintf(stderr,
++                  "[sim] step icount=%llu pc=0x%lx instr=0x%08x lr=0x%lx fp=0x%lx sp=0x%lx\n",
++                  (unsigned long long)icount_, (long)program_counter,
++                  *(uint32_t*)program_counter, (long)getLR(),
++                  (long)getRegister(fp), (long)getRegister(sp));
++        }
++        single_step_callback_(single_step_callback_arg_, this,
++                              (void*)program_counter);
++      }
++      SimInstruction* instr =
++          reinterpret_cast<SimInstruction*>(program_counter);
++      instructionDecode(instr);
++      icount_++;
++    }
++    program_counter = get_pc();
++  }
++
++  if (single_stepping_) {
++    single_step_callback_(single_step_callback_arg_, this, nullptr);
++  }
++}
++
++// =============================================================================
++// callInternal / call.
++// =============================================================================
++
++void Simulator::callInternal(uint8_t* entry) {
++  // Prepare to execute the code at entry.
++  setRegister(pc, reinterpret_cast<int64_t>(entry));
++  // The simulation stops when returning to this call point (LR == end_sim_pc).
++  setLR(end_sim_pc);
++
++  // Remember the values of callee-saved registers (r14-r31 in ELFv2).
++  int64_t r14_val = getRegister(r14);
++  int64_t r15_val = getRegister(r15);
++  int64_t r16_val = getRegister(r16);
++  int64_t r17_val = getRegister(r17);
++  int64_t r18_val = getRegister(r18);
++  int64_t r19_val = getRegister(r19);
++  int64_t r20_val = getRegister(r20);
++  int64_t r21_val = getRegister(r21);
++  int64_t r22_val = getRegister(r22);
++  int64_t r23_val = getRegister(r23);
++  int64_t r24_val = getRegister(r24);
++  int64_t r25_val = getRegister(r25);
++  int64_t r26_val = getRegister(r26);
++  int64_t r27_val = getRegister(r27);
++  int64_t r28_val = getRegister(r28);
++  int64_t r29_val = getRegister(r29);
++  int64_t r30_val = getRegister(r30);
++  int64_t r31_val = getRegister(r31);
++  int64_t sp_val = getRegister(sp);
++
++#ifdef DEBUG
++  // Set up callee-saved registers with a known value to detect clobbers.
++  // DEBUG-only: in release this would silently corrupt every JS-jit-entry
++  // stub frame, since the stub saves r14-r31 to its stack early on. Any
++  // single-step-profiling sample taken later (or any unwind through the
++  // stub's saved CSR area) then dereferences `icount_` as a frame
++  // pointer and crashes — see e.g. wasm/profiling.js, ion-error-*.js,
++  // ion-lazy-tables.js, ion-callerfp-tag.js, return-call-profiling.js,
++  // externref-global-postbarrier.js, builtin-modules/i8vecmul.js,
++  // asm.js/testBug1357053.js (all single-step-profiling tests). In
++  // debug builds the value collides with the same callsites but the
++  // MOZ_ASSERTs below catch any actual ABI violation, which is the
++  // entire point.
++  int64_t callee_saved_value = icount_;
++  setRegister(r14, callee_saved_value);
++  setRegister(r15, callee_saved_value);
++  setRegister(r16, callee_saved_value);
++  setRegister(r17, callee_saved_value);
++  setRegister(r18, callee_saved_value);
++  setRegister(r19, callee_saved_value);
++  setRegister(r20, callee_saved_value);
++  setRegister(r21, callee_saved_value);
++  setRegister(r22, callee_saved_value);
++  setRegister(r23, callee_saved_value);
++  setRegister(r24, callee_saved_value);
++  setRegister(r25, callee_saved_value);
++  setRegister(r26, callee_saved_value);
++  setRegister(r27, callee_saved_value);
++  setRegister(r28, callee_saved_value);
++  setRegister(r29, callee_saved_value);
++  setRegister(r30, callee_saved_value);
++  setRegister(r31, callee_saved_value);
++#endif
++
++  // Start the simulation.
++  if (Simulator::StopSimAt != -1) {
++    execute<true>();
++  } else {
++    execute<false>();
++  }
++
++#ifdef DEBUG
++  // Check that the callee-saved registers have been preserved.
++  MOZ_ASSERT(callee_saved_value == getRegister(r14));
++  MOZ_ASSERT(callee_saved_value == getRegister(r15));
++  MOZ_ASSERT(callee_saved_value == getRegister(r16));
++  MOZ_ASSERT(callee_saved_value == getRegister(r17));
++  MOZ_ASSERT(callee_saved_value == getRegister(r18));
++  MOZ_ASSERT(callee_saved_value == getRegister(r19));
++  MOZ_ASSERT(callee_saved_value == getRegister(r20));
++  MOZ_ASSERT(callee_saved_value == getRegister(r21));
++  MOZ_ASSERT(callee_saved_value == getRegister(r22));
++  MOZ_ASSERT(callee_saved_value == getRegister(r23));
++  MOZ_ASSERT(callee_saved_value == getRegister(r24));
++  MOZ_ASSERT(callee_saved_value == getRegister(r25));
++  MOZ_ASSERT(callee_saved_value == getRegister(r26));
++  MOZ_ASSERT(callee_saved_value == getRegister(r27));
++  MOZ_ASSERT(callee_saved_value == getRegister(r28));
++  MOZ_ASSERT(callee_saved_value == getRegister(r29));
++  MOZ_ASSERT(callee_saved_value == getRegister(r30));
++  MOZ_ASSERT(callee_saved_value == getRegister(r31));
++#endif
++
++  // Restore callee-saved registers.
++  setRegister(r14, r14_val);
++  setRegister(r15, r15_val);
++  setRegister(r16, r16_val);
++  setRegister(r17, r17_val);
++  setRegister(r18, r18_val);
++  setRegister(r19, r19_val);
++  setRegister(r20, r20_val);
++  setRegister(r21, r21_val);
++  setRegister(r22, r22_val);
++  setRegister(r23, r23_val);
++  setRegister(r24, r24_val);
++  setRegister(r25, r25_val);
++  setRegister(r26, r26_val);
++  setRegister(r27, r27_val);
++  setRegister(r28, r28_val);
++  setRegister(r29, r29_val);
++  setRegister(r30, r30_val);
++  setRegister(r31, r31_val);
++  setRegister(sp, sp_val);
++}
++
++int64_t Simulator::call(uint8_t* entry, int argument_count, ...) {
++  va_list parameters;
++  va_start(parameters, argument_count);
++
++  int64_t original_stack = getRegister(sp);
++  // Compute position of stack on entry to generated code.
++  int64_t entry_stack = original_stack;
++  if (argument_count > kCArgSlotCount) {
++    entry_stack = entry_stack - argument_count * sizeof(int64_t);
++  } else {
++    entry_stack = entry_stack - kCArgsSlotsSize;
++  }
++
++  entry_stack &= ~U64(ABIStackAlignment - 1);
++
++  intptr_t* stack_argument = reinterpret_cast<intptr_t*>(entry_stack);
++
++  // PPC64 ELFv2: first 8 integer args go in r3-r10.
++  for (int i = 0; i < argument_count; i++) {
++    js::jit::Register argReg;
++    if (GetIntArgReg(i, &argReg)) {
++      setRegister(argReg.code(), va_arg(parameters, int64_t));
++    } else {
++      stack_argument[i] = va_arg(parameters, int64_t);
++    }
++  }
++
++  va_end(parameters);
++  setRegister(sp, entry_stack);
++
++  callInternal(entry);
++
++  MOZ_ASSERT(entry_stack == getRegister(sp));
++  setRegister(sp, original_stack);
++
++  int64_t result = getRegister(r3);
++  return result;
++}
++
++uintptr_t Simulator::pushAddress(uintptr_t address) {
++  int64_t new_sp = getRegister(sp) - sizeof(uintptr_t);
++  uintptr_t* stack_slot = reinterpret_cast<uintptr_t*>(new_sp);
++  *stack_slot = address;
++  setRegister(sp, new_sp);
++  return new_sp;
++}
++
++uintptr_t Simulator::popAddress() {
++  int64_t current_sp = getRegister(sp);
++  uintptr_t* stack_slot = reinterpret_cast<uintptr_t*>(current_sp);
++  uintptr_t address = *stack_slot;
++  setRegister(sp, current_sp + sizeof(uintptr_t));
++  return address;
++}
++
++}  // namespace jit
++}  // namespace js
++
++js::jit::Simulator* JSContext::simulator() const { return simulator_; }
+diff --git a/js/src/jit/ppc64/Simulator-ppc64.h b/js/src/jit/ppc64/Simulator-ppc64.h
+new file mode 100644
+index 000000000000..c7a3f3767d61
+--- /dev/null
++++ b/js/src/jit/ppc64/Simulator-ppc64.h
+@@ -0,0 +1,556 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_Simulator_ppc64_h
++#define jit_ppc64_Simulator_ppc64_h
++
++#ifdef JS_SIMULATOR_PPC64
++
++#  include "mozilla/Atomics.h"
++
++#  include "jit/IonTypes.h"
++#  include "js/ProfilingFrameIterator.h"
++#  include "threading/Thread.h"
++#  include "vm/MutexIDs.h"
++#  include "wasm/WasmSignalHandlers.h"
++
++namespace js {
++namespace jit {
++
++class JitActivation;
++class Simulator;
++class Redirection;
++class CachePage;
++class AutoLockSimulator;
++
++typedef void (*SingleStepCallback)(void* arg, Simulator* sim, void* pc);
++
++const intptr_t kPointerAlignment = 8;
++const intptr_t kPointerAlignmentMask = kPointerAlignment - 1;
++const intptr_t kDoubleAlignment = 8;
++const intptr_t kDoubleAlignmentMask = kDoubleAlignment - 1;
++
++const int kNumGPRegisters = 32;
++const int kPCRegister = 32;
++const int kNumFPURegisters = 32;
++const int kNumVRRegisters = 32;  // VR0-VR31 (Altivec/VMX; = VSR32-63 in VSX)
++
++// PPC64 Condition Register: 8 fields of 4 bits each.
++// Each field: bit3=LT, bit2=GT, bit1=EQ, bit0=SO (in PPC big-endian numbering
++// within a field, but stored in little-endian nibble order in our uint32_t).
++const int kNumCRFields = 8;
++
++// CR field bit positions (within a 4-bit field).
++const uint8_t kCRFieldLT = 0x8;
++const uint8_t kCRFieldGT = 0x4;
++const uint8_t kCRFieldEQ = 0x2;
++const uint8_t kCRFieldSO = 0x1;
++
++// XER register bit positions.
++const int kXERSOBit = 31;
++const int kXEROVBit = 30;
++const int kXERCABit = 29;
++const int kXEROV32Bit = 19;
++const int kXERCA32Bit = 18;
++
++// FPSCR rounding mode bits (bits 62:63, stored in low bits of our uint64_t).
++const uint64_t kFPSCRRNMask = 0x3;
++
++// FPU rounding modes matching PPC64 FPSCR RN field.
++enum FPURoundingMode {
++  RN = 0,  // Round to Nearest (ties to even)
++  RZ = 1,  // Round toward Zero
++  RP = 2,  // Round toward +Infinity
++  RM = 3,  // Round toward -Infinity
++};
++
++// FPU invalid result constants.
++const uint32_t kFPUInvalidResult = static_cast<uint32_t>(1 << 31) - 1;
++const int32_t kFPUInvalidResultNegative = static_cast<int32_t>(1u << 31);
++const uint64_t kFPU64InvalidResult =
++    static_cast<uint64_t>(static_cast<uint64_t>(1) << 63) - 1;
++const int64_t kFPU64InvalidResultNegative =
++    static_cast<int64_t>(static_cast<uint64_t>(1) << 63);
++
++// Breakpoint/stop code ranges.
++const uint32_t kMaxWatchpointCode = 31;
++const uint32_t kMaxStopCode = 127;
++const uint32_t kWasmTrapCode = 6;
++
++// Redirection instruction: PPC_stop (0x4C0002E4).
++// Distinct from PPC_trap (0x7FE00008) used for wasm traps.
++const uint32_t kCallRedirInstr = 0x4C0002E4;
++
++typedef uint32_t Instr;
++class SimInstruction;
++
++class Simulator {
++  friend class ppc64Debugger;
++
++ public:
++  enum Register {
++    no_reg = -1,
++    r0 = 0,
++    r1,
++    r2,
++    r3,
++    r4,
++    r5,
++    r6,
++    r7,
++    r8,
++    r9,
++    r10,
++    r11,
++    r12,
++    r13,
++    r14,
++    r15,
++    r16,
++    r17,
++    r18,
++    r19,
++    r20,
++    r21,
++    r22,
++    r23,
++    r24,
++    r25,
++    r26,
++    r27,
++    r28,
++    r29,
++    r30,
++    r31,
++    pc,
++    kNumSimuRegisters,
++    // Aliases
++    sp = r1,
++    fp = r31,
++  };
++
++  enum FPURegister {
++    f0 = 0,
++    f1,
++    f2,
++    f3,
++    f4,
++    f5,
++    f6,
++    f7,
++    f8,
++    f9,
++    f10,
++    f11,
++    f12,
++    f13,
++    f14,
++    f15,
++    f16,
++    f17,
++    f18,
++    f19,
++    f20,
++    f21,
++    f22,
++    f23,
++    f24,
++    f25,
++    f26,
++    f27,
++    f28,
++    f29,
++    f30,
++    f31,
++    kNumFPURegisters
++  };
++
++  static Simulator* Create();
++  static void Destroy(Simulator* simulator);
++
++  Simulator();
++  ~Simulator();
++
++  static Simulator* Current();
++
++  static inline uintptr_t StackLimit() {
++    return Simulator::Current()->stackLimit();
++  }
++
++  uintptr_t* addressOfStackLimit();
++
++  // GPR accessors.
++  void setRegister(int reg, int64_t value);
++  int64_t getRegister(int reg) const;
++
++  // FPR accessors.
++  void setFpuRegister(int fpureg, int64_t value);
++  void setFpuRegisterWord(int fpureg, int32_t value);
++  void setFpuRegisterFloat(int fpureg, float value);
++  void setFpuRegisterDouble(int fpureg, double value);
++  int64_t getFpuRegister(int fpureg) const;
++  int32_t getFpuRegisterWord(int fpureg) const;
++  int32_t getFpuRegisterSignedWord(int fpureg) const;
++  float getFpuRegisterFloat(int fpureg) const;
++  double getFpuRegisterDouble(int fpureg) const;
++
++  // VR accessors (Altivec/VMX registers VR0-VR31). The bytes array is the
++  // ground truth: bytes[0] is the most-significant-byte on PPC64 big-endian
++  // numbering, i.e., VSR[MSB..LSB] mapped as bytes[0..15]. Callers that want
++  // typed views (lane 0 etc.) should extract from the bytes array according
++  // to the ISA's lane numbering for that instruction.
++  void setVRBytes(int vreg, const uint8_t bytes[16]);
++  void getVRBytes(int vreg, uint8_t bytes[16]) const;
++
++  // VSR (Vector-Scalar Register) accessors: unified 64-register namespace
++  // where VSR 0-31 aliases FPR 0-31 (DW0 is the FPR value, DW1 is
++  // architecturally undefined — we model it as zero on read, ignored on
++  // write) and VSR 32-63 aliases VR 0-31. Used by VSX instructions
++  // (xxpermdi, xxlor, xxlxor, mtvsrd, mfvsrd, ...).
++  void getVSR128(int vsr, uint8_t bytes[16]) const;
++  void setVSR128(int vsr, const uint8_t bytes[16]);
++
++  // SPR accessors.
++  int64_t getLR() const { return LR_; }
++  void setLR(int64_t value) { LR_ = value; }
++  int64_t getCTR() const { return CTR_; }
++  void setCTR(int64_t value) { CTR_ = value; }
++  uint32_t getCR() const { return CR_; }
++  void setCR(uint32_t value) { CR_ = value; }
++  uint64_t getXER() const { return XER_; }
++  void setXER(uint64_t value) { XER_ = value; }
++  uint64_t getFPSCR() const { return FPSCR_; }
++  void setFPSCR(uint64_t value) { FPSCR_ = value; }
++
++  // CR field accessors: field 0 is the most significant nibble (bits 31:28).
++  uint8_t getCRField(int field) const {
++    return (CR_ >> (4 * (7 - field))) & 0xF;
++  }
++  void setCRField(int field, uint8_t val) {
++    uint32_t shift = 4 * (7 - field);
++    CR_ = (CR_ & ~(0xFu << shift)) | ((val & 0xFu) << shift);
++  }
++
++  // XER bit accessors.
++  bool getXERSO() const { return (XER_ >> kXERSOBit) & 1; }
++  void setXERSO(bool v) {
++    XER_ = (XER_ & ~(1ull << kXERSOBit)) | ((uint64_t)v << kXERSOBit);
++  }
++  bool getXEROV() const { return (XER_ >> kXEROVBit) & 1; }
++  void setXEROV(bool v) {
++    XER_ = (XER_ & ~(1ull << kXEROVBit)) | ((uint64_t)v << kXEROVBit);
++    // Mirror to OV32. Real POWER9 silicon sets OV32 == OV for both 32-bit
++    // and 64-bit overflow ops: mulldo(2, 2^62) produces OV=OV32=1;
++    // mulldo(2^30, 4) produces OV=OV32=0. The JIT's
++    // POWER9 Overflow path is `mulldo + mcrxrx + bc Overflow`, where
++    // mcrxrx places OV32 in the GT slot and the Overflow condition tests
++    // GT — so OV32 must be live or no-overflow is reported even when
++    // OV=1. Without this mirror, BigInt fast-path mul silently wraps.
++    XER_ = (XER_ & ~(1ull << kXEROV32Bit)) | ((uint64_t)v << kXEROV32Bit);
++    if (v) setXERSO(true);
++  }
++  bool getXERCA() const { return (XER_ >> kXERCABit) & 1; }
++  void setXERCA(bool v) {
++    XER_ = (XER_ & ~(1ull << kXERCABit)) | ((uint64_t)v << kXERCABit);
++  }
++
++  // PC accessors.
++  void set_pc(int64_t value);
++  int64_t get_pc() const;
++
++  template <typename T>
++  T get_pc_as() const {
++    return reinterpret_cast<T>(get_pc());
++  }
++
++  void enable_single_stepping(SingleStepCallback cb, void* arg);
++  void disable_single_stepping();
++
++  uintptr_t stackLimit() const;
++  bool overRecursed(uintptr_t newsp = 0) const;
++  bool overRecursedWithExtra(uint32_t extra) const;
++
++  template <bool enableStopSimAt>
++  void execute();
++
++  int64_t call(uint8_t* entry, int argument_count, ...);
++
++  uintptr_t pushAddress(uintptr_t address);
++  uintptr_t popAddress();
++
++  void setLastDebuggerInput(char* input);
++  char* lastDebuggerInput() { return lastDebuggerInput_; }
++
++  bool has_bad_pc() const;
++
++  // Update CR field 0 from a 64-bit result.
++  void updateCR0(int64_t result) {
++    uint8_t field = kCRFieldSO * getXERSO();
++    if (result < 0)
++      field |= kCRFieldLT;
++    else if (result > 0)
++      field |= kCRFieldGT;
++    else
++      field |= kCRFieldEQ;
++    setCRField(0, field);
++  }
++
++  // Update CR field 0 from a 32-bit result (sign-extended comparison).
++  void updateCR0_32(int32_t result) {
++    uint8_t field = kCRFieldSO * getXERSO();
++    if (result < 0)
++      field |= kCRFieldLT;
++    else if (result > 0)
++      field |= kCRFieldGT;
++    else
++      field |= kCRFieldEQ;
++    setCRField(0, field);
++  }
++
++  // Compare and set an arbitrary CR field.
++  void setCRFieldCmp(int field, int64_t lhs, int64_t rhs) {
++    uint8_t val = kCRFieldSO * getXERSO();
++    if (lhs < rhs)
++      val |= kCRFieldLT;
++    else if (lhs > rhs)
++      val |= kCRFieldGT;
++    else
++      val |= kCRFieldEQ;
++    setCRField(field, val);
++  }
++
++  void setCRFieldCmpU(int field, uint64_t lhs, uint64_t rhs) {
++    uint8_t val = kCRFieldSO * getXERSO();
++    if (lhs < rhs)
++      val |= kCRFieldLT;
++    else if (lhs > rhs)
++      val |= kCRFieldGT;
++    else
++      val |= kCRFieldEQ;
++    setCRField(field, val);
++  }
++
++ private:
++  enum SpecialValues {
++    // PPC64 masks the low 2 bits of branch targets, so these must be
++    // 4-byte aligned to survive the & ~3 mask in blr/bcctr.
++    bad_ra = -4,
++    end_sim_pc = -8,
++    Unpredictable = 0xbadbeaf
++  };
++
++  bool init();
++
++  void format(SimInstruction* instr, const char* format);
++
++  // Memory access.
++  inline uint8_t readBU(uint64_t addr);
++  inline int8_t readB(uint64_t addr);
++  inline void writeB(uint64_t addr, uint8_t value);
++  inline void writeB(uint64_t addr, int8_t value);
++
++  inline uint16_t readHU(uint64_t addr, SimInstruction* instr);
++  inline int16_t readH(uint64_t addr, SimInstruction* instr);
++  inline void writeH(uint64_t addr, uint16_t value, SimInstruction* instr);
++  inline void writeH(uint64_t addr, int16_t value, SimInstruction* instr);
++
++  inline uint32_t readWU(uint64_t addr, SimInstruction* instr);
++  inline int32_t readW(uint64_t addr, SimInstruction* instr);
++  inline void writeW(uint64_t addr, uint32_t value, SimInstruction* instr);
++  inline void writeW(uint64_t addr, int32_t value, SimInstruction* instr);
++
++  inline int64_t readDW(uint64_t addr, SimInstruction* instr);
++  inline void writeDW(uint64_t addr, int64_t value, SimInstruction* instr);
++
++  inline double readD(uint64_t addr, SimInstruction* instr);
++  inline void writeD(uint64_t addr, double value, SimInstruction* instr);
++
++  inline uint8_t loadLinkedB(uint64_t addr, SimInstruction* instr);
++  inline int storeConditionalB(uint64_t addr, uint8_t value,
++                               SimInstruction* instr);
++  inline uint16_t loadLinkedH(uint64_t addr, SimInstruction* instr);
++  inline int storeConditionalH(uint64_t addr, uint16_t value,
++                               SimInstruction* instr);
++  inline int32_t loadLinkedW(uint64_t addr, SimInstruction* instr);
++  inline int storeConditionalW(uint64_t addr, int32_t value,
++                               SimInstruction* instr);
++  inline int64_t loadLinkedD(uint64_t addr, SimInstruction* instr);
++  inline int storeConditionalD(uint64_t addr, int64_t value,
++                               SimInstruction* instr);
++
++  // Instruction decoders.
++  void decodeDFormALU(SimInstruction* instr);
++  void decodeDFormLoad(SimInstruction* instr);
++  void decodeDFormStore(SimInstruction* instr);
++  void decodeDSForm(SimInstruction* instr);
++  void decodeXForm(SimInstruction* instr);
++  void decodeRotateMask(SimInstruction* instr);
++  void decodeBranch(SimInstruction* instr);
++  void decodeFP(SimInstruction* instr);
++  void decodeVSX(SimInstruction* instr);
++  void decodeVMX(SimInstruction* instr);
++  // Power ISA v3.1 prefixed instructions. `prefix` points at the
++  // 4-byte prefix word; the suffix is read from `prefix + 4`.
++  void decodePrefixed(SimInstruction* prefix);
++
++  void softwareInterrupt(SimInstruction* instr);
++
++  // Stop/breakpoint helpers.
++  bool isWatchpoint(uint32_t code);
++  void printWatchpoint(uint32_t code);
++  void handleStop(uint32_t code, SimInstruction* instr);
++  bool isStopInstruction(SimInstruction* instr);
++  bool isEnabledStop(uint32_t code);
++  void enableStop(uint32_t code);
++  void disableStop(uint32_t code);
++  void increaseStopCounter(uint32_t code);
++  void printStopInfo(uint32_t code);
++
++  JS::ProfilingFrameIterator::RegisterState registerState();
++
++  bool MOZ_ALWAYS_INLINE handleWasmSegFault(uint64_t addr, unsigned numBytes) {
++    if (MOZ_LIKELY(!js::wasm::CodeExists)) {
++      return false;
++    }
++    uint8_t* newPC;
++    if (!js::wasm::MemoryAccessTraps(registerState(), (uint8_t*)addr, numBytes,
++                                     &newPC)) {
++      return false;
++    }
++    LLBit_ = false;
++    set_pc(int64_t(newPC));
++    return true;
++  }
++
++  void instructionDecode(SimInstruction* instr);
++
++ public:
++  static int64_t StopSimAt;
++
++  static void* RedirectNativeFunction(void* nativeFunction,
++                                      ABIFunctionType type);
++
++ private:
++  void setCallResultDouble(double result);
++  void setCallResultFloat(float result);
++  void setCallResult(int64_t res);
++#  ifdef XP_DARWIN
++  void setCallResult(intptr_t res);
++#  endif
++  void setCallResult(__int128 res);
++
++  void callInternal(uint8_t* entry);
++
++  // Architecture state.
++  int64_t registers_[kNumSimuRegisters];
++  int64_t FPUregisters_[kNumFPURegisters];
++  // VR namespace (Altivec/VMX registers VR0-VR31 == VSR32-63). Stored as
++  // 16 raw bytes per register to preserve exact architectural byte order
++  // independent of host endianness. Accessors defined below; the bytes
++  // array is the ground truth.
++  uint8_t VRregisters_[kNumVRRegisters][16];
++
++  // PPC64 Special Purpose Registers.
++  int64_t LR_;
++  int64_t CTR_;
++  uint32_t CR_;
++  uint64_t XER_;
++  uint64_t FPSCR_;
++
++  // Atomics.
++  bool LLBit_;
++  uintptr_t LLAddr_;
++  int64_t lastLLValue_;
++
++  // Simulator support.
++  char* stack_;
++  uintptr_t stackLimit_;
++  bool pc_modified_;
++  int64_t icount_;
++  int64_t break_count_;
++
++  char* lastDebuggerInput_;
++
++  SimInstruction* break_pc_;
++  Instr break_instr_;
++
++  bool single_stepping_;
++  SingleStepCallback single_step_callback_;
++  void* single_step_callback_arg_;
++
++  static const uint32_t kNumOfWatchedStops = 256;
++  static const uint32_t kStopDisabledBit = 1U << 31;
++
++  struct StopCountAndDesc {
++    uint32_t count_;
++    char* desc_;
++  };
++  StopCountAndDesc watchedStops_[kNumOfWatchedStops];
++};
++
++// Process-wide simulator state.
++class SimulatorProcess {
++  friend class Redirection;
++  friend class AutoLockSimulatorCache;
++
++ private:
++  struct ICacheHasher {
++    typedef void* Key;
++    typedef void* Lookup;
++    static HashNumber hash(const Lookup& l);
++    static bool match(const Key& k, const Lookup& l);
++  };
++
++ public:
++  typedef HashMap<void*, CachePage*, ICacheHasher, SystemAllocPolicy> ICacheMap;
++
++  static mozilla::Atomic<size_t, mozilla::ReleaseAcquire>
++      ICacheCheckingDisableCount;
++  static void FlushICache(void* start, size_t size);
++  static void checkICacheLocked(SimInstruction* instr);
++
++  static bool initialize() {
++    singleton_ = js_new<SimulatorProcess>();
++    return singleton_;
++  }
++  static void destroy() {
++    js_delete(singleton_);
++    singleton_ = nullptr;
++  }
++
++  SimulatorProcess();
++  ~SimulatorProcess();
++
++ private:
++  static SimulatorProcess* singleton_;
++
++  Mutex cacheLock_;
++  Redirection* redirection_;
++  ICacheMap icache_;
++
++ public:
++  static ICacheMap& icache() {
++    singleton_->cacheLock_.assertOwnedByCurrentThread();
++    return singleton_->icache_;
++  }
++
++  static Redirection* redirection() {
++    singleton_->cacheLock_.assertOwnedByCurrentThread();
++    return singleton_->redirection_;
++  }
++
++  static void setRedirection(js::jit::Redirection* redirection) {
++    singleton_->cacheLock_.assertOwnedByCurrentThread();
++    singleton_->redirection_ = redirection;
++  }
++};
++
++}  // namespace jit
++}  // namespace js
++
++#endif /* JS_SIMULATOR_PPC64 */
++
++#endif /* jit_ppc64_Simulator_ppc64_h */
+diff --git a/js/src/jit/ppc64/Trampoline-ppc64.cpp b/js/src/jit/ppc64/Trampoline-ppc64.cpp
+new file mode 100644
+index 000000000000..515a931c86b0
+--- /dev/null
++++ b/js/src/jit/ppc64/Trampoline-ppc64.cpp
+@@ -0,0 +1,648 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/Bailouts.h"
++#include "jit/BaselineFrame.h"
++#include "jit/CalleeToken.h"
++#include "jit/JitFrames.h"
++#include "jit/JitRuntime.h"
++#include "jit/PerfSpewer.h"
++#include "jit/ppc64/SharedICHelpers-ppc64.h"
++#include "jit/VMFunctions.h"
++#include "vm/JitActivation.h"
++#include "vm/JSContext.h"
++
++#include "jit/MacroAssembler-inl.h"
++
++using namespace js;
++using namespace js::jit;
++
++// Float (Single+Double) and all GPRs. Simd128 excluded — Ion compiles JS
++// (no v128 type), so SIMD regs are never live at bailout / invalidator /
++// preBarrier entry. Including them would force the bailout frame's
++// FPUArray to hold v128 slots that Ion never writes.
++static const LiveRegisterSet AllRegs = LiveRegisterSet(
++    GeneralRegisterSet(Registers::AllMask),
++    FloatRegisterSet(FloatRegisters::AllSingleMask |
++                     FloatRegisters::AllDoubleMask));
++
++static_assert(sizeof(uintptr_t) == sizeof(uint64_t), "Not 64-bit clean.");
++
++// PPC64 ELFv2 callee-saved: GPRs r14-r31, FPRs f14-f31, VRs VR20-VR31, LR.
++// We also save reg_vp (r10 / IntArgReg7) so we can use it after the JIT call.
++//
++// Layout is alignas(16) so that after `reserveStack(sizeof(EnterJITRegs))`
++// the SP-relative offset of every VR slot is 16-byte aligned, satisfying
++// the 16-byte alignment requirement of stxvd2x / stvx (stvx is technically
++// alignment-tolerant, but we'd rather align by construction). Padding at
++// the end keeps sizeof a multiple of 16 so SP stays quadword-aligned per
++// the ELFv2 stack-pointer rule.
++struct alignas(16) EnterJITRegs {
++  // VR20-VR31 first so their SP-relative offsets are 0, 16, 32, ... — all
++  // 16-byte aligned regardless of what follows.
++  uint8_t vr20[16];
++  uint8_t vr21[16];
++  uint8_t vr22[16];
++  uint8_t vr23[16];
++  uint8_t vr24[16];
++  uint8_t vr25[16];
++  uint8_t vr26[16];
++  uint8_t vr27[16];
++  uint8_t vr28[16];
++  uint8_t vr29[16];
++  uint8_t vr30[16];
++  uint8_t vr31[16];
++
++  double f31;
++  double f30;
++  double f29;
++  double f28;
++  double f27;
++  double f26;
++  double f25;
++  double f24;
++  double f23;
++  double f22;
++  double f21;
++  double f20;
++  double f19;
++  double f18;
++  double f17;
++  double f16;
++  double f15;
++  double f14;
++
++  uint64_t r31;  // FramePointer
++  uint64_t r30;
++  uint64_t r29;
++  uint64_t r28;
++  uint64_t r27;
++  uint64_t r26;
++  uint64_t r25;
++  uint64_t r24;
++  uint64_t r23;
++  uint64_t r22;
++  uint64_t r21;
++  uint64_t r20;
++  uint64_t r19;
++  uint64_t r18;
++  uint64_t r17;
++  uint64_t r16;
++  uint64_t r15;
++  uint64_t r14;
++  uint64_t r2;  // TOC pointer
++  uint64_t lr;
++  // Save reg_vp (r10) on stack so we can use it after the JIT call returns.
++  uint64_t r10;
++};
++// alignas(16) on the struct ensures sizeof is a multiple of 16, which keeps
++// SP quadword-aligned after `reserveStack(sizeof(EnterJITRegs))`. The
++// existing fields total 312 bytes; with the 192 bytes of VR slots we are
++// at 504, which alignas(16) bumps to 512.
++static_assert((sizeof(EnterJITRegs) % 16) == 0,
++              "EnterJITRegs must be 16-byte aligned to keep SP aligned");
++
++static void GenerateReturn(MacroAssembler& masm) {
++  MOZ_ASSERT(masm.framePushed() == sizeof(EnterJITRegs));
++
++  // Restore non-volatile GPRs.
++  masm.as_ld(r14, StackPointer, offsetof(EnterJITRegs, r14));
++  masm.as_ld(r15, StackPointer, offsetof(EnterJITRegs, r15));
++  masm.as_ld(r16, StackPointer, offsetof(EnterJITRegs, r16));
++  masm.as_ld(r17, StackPointer, offsetof(EnterJITRegs, r17));
++  masm.as_ld(r18, StackPointer, offsetof(EnterJITRegs, r18));
++  masm.as_ld(r19, StackPointer, offsetof(EnterJITRegs, r19));
++  masm.as_ld(r20, StackPointer, offsetof(EnterJITRegs, r20));
++  masm.as_ld(r21, StackPointer, offsetof(EnterJITRegs, r21));
++  masm.as_ld(r22, StackPointer, offsetof(EnterJITRegs, r22));
++  masm.as_ld(r23, StackPointer, offsetof(EnterJITRegs, r23));
++  masm.as_ld(r24, StackPointer, offsetof(EnterJITRegs, r24));
++  masm.as_ld(r25, StackPointer, offsetof(EnterJITRegs, r25));
++  masm.as_ld(r26, StackPointer, offsetof(EnterJITRegs, r26));
++  masm.as_ld(r27, StackPointer, offsetof(EnterJITRegs, r27));
++  masm.as_ld(r28, StackPointer, offsetof(EnterJITRegs, r28));
++  masm.as_ld(r29, StackPointer, offsetof(EnterJITRegs, r29));
++  masm.as_ld(r30, StackPointer, offsetof(EnterJITRegs, r30));
++  masm.as_ld(r31, StackPointer, offsetof(EnterJITRegs, r31));
++  masm.as_ld(r2, StackPointer, offsetof(EnterJITRegs, r2));
++
++  // Restore LR.
++  masm.as_ld(r0, StackPointer, offsetof(EnterJITRegs, lr));
++  masm.xs_mtlr(r0);
++
++  // Restore non-volatile FPRs.
++  masm.as_lfd(f14, StackPointer, offsetof(EnterJITRegs, f14));
++  masm.as_lfd(f15, StackPointer, offsetof(EnterJITRegs, f15));
++  masm.as_lfd(f16, StackPointer, offsetof(EnterJITRegs, f16));
++  masm.as_lfd(f17, StackPointer, offsetof(EnterJITRegs, f17));
++  masm.as_lfd(f18, StackPointer, offsetof(EnterJITRegs, f18));
++  masm.as_lfd(f19, StackPointer, offsetof(EnterJITRegs, f19));
++  masm.as_lfd(f20, StackPointer, offsetof(EnterJITRegs, f20));
++  masm.as_lfd(f21, StackPointer, offsetof(EnterJITRegs, f21));
++  masm.as_lfd(f22, StackPointer, offsetof(EnterJITRegs, f22));
++  masm.as_lfd(f23, StackPointer, offsetof(EnterJITRegs, f23));
++  masm.as_lfd(f24, StackPointer, offsetof(EnterJITRegs, f24));
++  masm.as_lfd(f25, StackPointer, offsetof(EnterJITRegs, f25));
++  masm.as_lfd(f26, StackPointer, offsetof(EnterJITRegs, f26));
++  masm.as_lfd(f27, StackPointer, offsetof(EnterJITRegs, f27));
++  masm.as_lfd(f28, StackPointer, offsetof(EnterJITRegs, f28));
++  masm.as_lfd(f29, StackPointer, offsetof(EnterJITRegs, f29));
++  masm.as_lfd(f30, StackPointer, offsetof(EnterJITRegs, f30));
++  masm.as_lfd(f31, StackPointer, offsetof(EnterJITRegs, f31));
++
++  // Restore callee-saved VR20-VR31 (ELFv2). lvx uses indexed addressing
++  // (RA + RB), and r0's value is used here as RB (RA = StackPointer is
++  // non-zero, so its value is added). r0 is non-allocatable.
++#define RESTORE_VR(N)                                                 \
++  masm.xs_li(r0, offsetof(EnterJITRegs, vr##N));                      \
++  masm.as_lvx(N, StackPointer, r0)
++  RESTORE_VR(20); RESTORE_VR(21); RESTORE_VR(22); RESTORE_VR(23);
++  RESTORE_VR(24); RESTORE_VR(25); RESTORE_VR(26); RESTORE_VR(27);
++  RESTORE_VR(28); RESTORE_VR(29); RESTORE_VR(30); RESTORE_VR(31);
++#undef RESTORE_VR
++
++  masm.freeStack(sizeof(EnterJITRegs));
++
++  masm.as_blr();
++}
++
++static void GeneratePrologue(MacroAssembler& masm) {
++  // Save LR first (PPC64 LR is SPR, not GPR).
++  masm.xs_mflr(r0);
++
++  // ELFv2 prologue convention: save LR at caller's frame [SP+16] BEFORE
++  // decrementing SP. External unwinders (gdb, perf, libunwind) walk the
++  // stack by reading LR-save slots at [SP+16] of every frame; without
++  // this write they'd find junk at our caller's slot. Costs 1 extra
++  // instruction; we still keep the in-frame save below for clean
++  // restore symmetry.
++  masm.as_std(r0, StackPointer, 16);
++
++  masm.reserveStack(sizeof(EnterJITRegs));
++
++  // Save LR (also kept in our own frame for the clean restore in
++  // GenerateReturn — see comment there).
++  masm.as_std(r0, StackPointer, offsetof(EnterJITRegs, lr));
++
++  // Save non-volatile GPRs.
++  masm.as_std(r2, StackPointer, offsetof(EnterJITRegs, r2));
++  masm.as_std(r14, StackPointer, offsetof(EnterJITRegs, r14));
++  masm.as_std(r15, StackPointer, offsetof(EnterJITRegs, r15));
++  masm.as_std(r16, StackPointer, offsetof(EnterJITRegs, r16));
++  masm.as_std(r17, StackPointer, offsetof(EnterJITRegs, r17));
++  masm.as_std(r18, StackPointer, offsetof(EnterJITRegs, r18));
++  masm.as_std(r19, StackPointer, offsetof(EnterJITRegs, r19));
++  masm.as_std(r20, StackPointer, offsetof(EnterJITRegs, r20));
++  masm.as_std(r21, StackPointer, offsetof(EnterJITRegs, r21));
++  masm.as_std(r22, StackPointer, offsetof(EnterJITRegs, r22));
++  masm.as_std(r23, StackPointer, offsetof(EnterJITRegs, r23));
++  masm.as_std(r24, StackPointer, offsetof(EnterJITRegs, r24));
++  masm.as_std(r25, StackPointer, offsetof(EnterJITRegs, r25));
++  masm.as_std(r26, StackPointer, offsetof(EnterJITRegs, r26));
++  masm.as_std(r27, StackPointer, offsetof(EnterJITRegs, r27));
++  masm.as_std(r28, StackPointer, offsetof(EnterJITRegs, r28));
++  masm.as_std(r29, StackPointer, offsetof(EnterJITRegs, r29));
++  masm.as_std(r30, StackPointer, offsetof(EnterJITRegs, r30));
++  masm.as_std(r31, StackPointer, offsetof(EnterJITRegs, r31));
++
++  // Save reg_vp (r10) so we can retrieve it after the JIT call.
++  masm.as_std(r10, StackPointer, offsetof(EnterJITRegs, r10));
++
++  // Save non-volatile FPRs.
++  masm.as_stfd(f14, StackPointer, offsetof(EnterJITRegs, f14));
++  masm.as_stfd(f15, StackPointer, offsetof(EnterJITRegs, f15));
++  masm.as_stfd(f16, StackPointer, offsetof(EnterJITRegs, f16));
++  masm.as_stfd(f17, StackPointer, offsetof(EnterJITRegs, f17));
++  masm.as_stfd(f18, StackPointer, offsetof(EnterJITRegs, f18));
++  masm.as_stfd(f19, StackPointer, offsetof(EnterJITRegs, f19));
++  masm.as_stfd(f20, StackPointer, offsetof(EnterJITRegs, f20));
++  masm.as_stfd(f21, StackPointer, offsetof(EnterJITRegs, f21));
++  masm.as_stfd(f22, StackPointer, offsetof(EnterJITRegs, f22));
++  masm.as_stfd(f23, StackPointer, offsetof(EnterJITRegs, f23));
++  masm.as_stfd(f24, StackPointer, offsetof(EnterJITRegs, f24));
++  masm.as_stfd(f25, StackPointer, offsetof(EnterJITRegs, f25));
++  masm.as_stfd(f26, StackPointer, offsetof(EnterJITRegs, f26));
++  masm.as_stfd(f27, StackPointer, offsetof(EnterJITRegs, f27));
++  masm.as_stfd(f28, StackPointer, offsetof(EnterJITRegs, f28));
++  masm.as_stfd(f29, StackPointer, offsetof(EnterJITRegs, f29));
++  masm.as_stfd(f30, StackPointer, offsetof(EnterJITRegs, f30));
++  masm.as_stfd(f31, StackPointer, offsetof(EnterJITRegs, f31));
++
++  // Save callee-saved VR20-VR31 (ELFv2). The JIT freely uses VMX registers
++  // via EmitVmxBinary etc.; without this save the C caller's VR20-VR31
++  // contents would be trashed on return. stvx uses indexed addressing —
++  // r0 holds the offset (non-allocatable in JIT regalloc; safe to use as
++  // a free temp here).
++#define SAVE_VR(N)                                                    \
++  masm.xs_li(r0, offsetof(EnterJITRegs, vr##N));                      \
++  masm.as_stvx(N, StackPointer, r0)
++  SAVE_VR(20); SAVE_VR(21); SAVE_VR(22); SAVE_VR(23);
++  SAVE_VR(24); SAVE_VR(25); SAVE_VR(26); SAVE_VR(27);
++  SAVE_VR(28); SAVE_VR(29); SAVE_VR(30); SAVE_VR(31);
++#undef SAVE_VR
++}
++
++void JitRuntime::generateEnterJIT(JSContext* cx, MacroAssembler& masm) {
++  AutoCreatedBy acb(masm, "JitRuntime::generateEnterJIT");
++
++  enterJITOffset_ = startTrampolineCode(masm);
++
++  // EnterJitCode signature: (void* code, unsigned argc, Value* argv,
++  //                          InterpreterFrame* fp, CalleeToken calleeToken,
++  //                          JSObject* envChain, size_t numStackValues,
++  //                          Value* vp)
++  const Register reg_code = IntArgReg0;                       // r3
++  const Register reg_argc = IntArgReg1;                       // r4
++  const Register reg_argv = IntArgReg2;                       // r5
++  const mozilla::DebugOnly<Register> reg_frame = IntArgReg3;  // r6
++  const Register reg_token = IntArgReg4;                      // r7
++  const Register reg_chain = IntArgReg5;                      // r8
++  const Register reg_values = IntArgReg6;                     // r9
++  const Register reg_vp = IntArgReg7;                         // r10
++
++  MOZ_ASSERT(OsrFrameReg == reg_frame);
++
++  GeneratePrologue(masm);
++
++  // Save stack pointer as baseline frame.
++  masm.movePtr(StackPointer, FramePointer);
++
++  // Use non-volatile scratch registers for generateEnterJitShared.
++  // r14, r15, r17 are non-volatile and not special-purpose in JIT.
++  generateEnterJitShared(masm, reg_argc, reg_argv, reg_token, r14, r15, r17);
++
++  // Push the descriptor.
++  masm.unboxInt32(Address(reg_vp, 0), r14);
++  masm.pushFrameDescriptorForJitCall(FrameType::CppToJSJit, r14, r14);
++
++  CodeLabel returnLabel;
++  Label oomReturnLabel;
++  {
++    // Handle Interpreter -> Baseline OSR.
++    AllocatableGeneralRegisterSet regs(GeneralRegisterSet::All());
++    MOZ_ASSERT(!regs.has(FramePointer));
++    regs.take(OsrFrameReg);
++    regs.take(reg_code);
++    MOZ_ASSERT(!regs.has(ReturnReg), "ReturnReg matches reg_code");
++
++    Label notOsr;
++    masm.branchTestPtr(Assembler::Zero, OsrFrameReg, OsrFrameReg, &notOsr);
++
++    Register numStackValues = reg_values;
++    regs.take(numStackValues);
++    Register scratch = regs.takeAny();
++
++    // Push return address.
++    masm.subPtr(Imm32(sizeof(uintptr_t)), StackPointer);
++    masm.mov(&returnLabel, scratch);
++    masm.storePtr(scratch, Address(StackPointer, 0));
++
++    // Push previous frame pointer.
++    masm.subPtr(Imm32(sizeof(uintptr_t)), StackPointer);
++    masm.storePtr(FramePointer, Address(StackPointer, 0));
++
++    // Reserve frame.
++    Register framePtr = FramePointer;
++    masm.movePtr(StackPointer, framePtr);
++    masm.subPtr(Imm32(BaselineFrame::Size()), StackPointer);
++
++    Register framePtrScratch = regs.takeAny();
++    masm.movePtr(StackPointer, framePtrScratch);
++
++    // Reserve space for locals and stack values.
++    masm.x_sldi(scratch, numStackValues, 3);
++    masm.subPtr(scratch, StackPointer);
++
++    // Enter exit frame.
++    masm.reserveStack(3 * sizeof(uintptr_t));
++    masm.storePtr(ImmWord(MakeFrameDescriptor(FrameType::BaselineJS)),
++                  Address(StackPointer, 2 * sizeof(uintptr_t)));
++    masm.storePtr(ImmPtr(nullptr), Address(StackPointer, sizeof(uintptr_t)));
++    masm.storePtr(FramePointer, Address(StackPointer, 0));
++
++    // No GC things to mark, push a bare token.
++    masm.loadJSContext(scratch);
++    masm.enterFakeExitFrame(scratch, scratch, ExitFrameType::Bare);
++
++    masm.reserveStack(2 * sizeof(uintptr_t));
++    masm.storePtr(framePtr, Address(StackPointer, sizeof(uintptr_t)));
++    masm.storePtr(reg_code, Address(StackPointer, 0));
++
++    using Fn = void (*)(BaselineFrame* frame, InterpreterFrame* interpFrame,
++                        uint32_t numStackValues);
++    masm.setupUnalignedABICall(scratch);
++    masm.passABIArg(framePtrScratch);
++    masm.passABIArg(OsrFrameReg);
++    masm.passABIArg(numStackValues);
++    masm.callWithABI<Fn, jit::InitBaselineFrameForOsr>(
++        ABIType::General, CheckUnsafeCallWithABI::DontCheckHasExitFrame);
++
++    regs.add(OsrFrameReg);
++    Register jitcode = regs.takeAny();
++    masm.loadPtr(Address(StackPointer, 0), jitcode);
++    masm.loadPtr(Address(StackPointer, sizeof(uintptr_t)), framePtr);
++    masm.freeStack(2 * sizeof(uintptr_t));
++
++    masm.freeStack(ExitFrameLayout::SizeWithFooter());
++
++    // If OSR-ing, then emit instrumentation for setting lastProfilerFrame
++    // if profiler instrumentation is enabled.
++    {
++      Label skipProfilingInstrumentation;
++      AbsoluteAddress addressOfEnabled(
++          cx->runtime()->geckoProfiler().addressOfEnabled());
++      masm.branch32(Assembler::Equal, addressOfEnabled, Imm32(0),
++                    &skipProfilingInstrumentation);
++      masm.profilerEnterFrame(framePtr, scratch);
++      masm.bind(&skipProfilingInstrumentation);
++    }
++
++    masm.jump(jitcode);
++
++    masm.bind(&notOsr);
++    // Load the scope chain in R1.
++    MOZ_ASSERT(R1.scratchReg() != reg_code);
++    masm.movePtr(reg_chain, R1.scratchReg());
++  }
++
++  // The call will push the return address and frame pointer on the stack, thus
++  // we check that the stack would be aligned once the call is complete.
++  masm.assertStackAlignment(JitStackAlignment, 2 * sizeof(uintptr_t));
++
++  // Call the function with pushing return address to stack.
++  masm.callJitNoProfiler(reg_code);
++
++  {
++    // Interpreter -> Baseline OSR will return here.
++    masm.bind(&returnLabel);
++    masm.addCodeLabel(returnLabel);
++    masm.bind(&oomReturnLabel);
++  }
++
++  // Discard arguments and padding. Set sp to the address of the EnterJITRegs
++  // on the stack.
++  masm.movePtr(FramePointer, StackPointer);
++
++  // Store the returned value into the vp.
++  masm.as_ld(reg_vp, StackPointer, offsetof(EnterJITRegs, r10));
++  masm.storeValue(JSReturnOperand, Address(reg_vp, 0));
++
++  // Restore non-volatile registers and return.
++  GenerateReturn(masm);
++}
++
++// static
++mozilla::Maybe<::JS::ProfilingFrameIterator::RegisterState>
++JitRuntime::getCppEntryRegisters(JitFrameLayout* frameStackAddress) {
++  return mozilla::Nothing{};
++}
++
++void JitRuntime::generateInvalidator(MacroAssembler& masm, Label* bailoutTail) {
++  AutoCreatedBy acb(masm, "JitRuntime::generateInvalidator");
++
++  invalidatorOffset_ = startTrampolineCode(masm);
++
++  masm.checkStackAlignment();
++
++  // Push all registers so we can access them from [base + code].
++  masm.PushRegsInMask(AllRegs);
++
++  // Pass pointer to InvalidationBailoutStack structure.
++  masm.movePtr(StackPointer, IntArgReg0);
++
++  // Reserve place for BailoutInfo pointer. Two words to ensure alignment for
++  // setupAlignedABICall.
++  masm.subPtr(Imm32(2 * sizeof(uintptr_t)), StackPointer);
++  masm.movePtr(StackPointer, IntArgReg1);
++
++  using Fn = bool (*)(InvalidationBailoutStack* sp, BaselineBailoutInfo** info);
++  masm.setupAlignedABICall();
++  masm.passABIArg(IntArgReg0);
++  masm.passABIArg(IntArgReg1);
++  masm.callWithABI<Fn, InvalidationBailout>(
++      ABIType::General, CheckUnsafeCallWithABI::DontCheckOther);
++
++  masm.pop(IntArgReg2);
++
++  // Pop the machine state and the dead frame.
++  masm.moveToStackPtr(FramePointer);
++
++  // Jump to shared bailout tail. The BailoutInfo pointer has to be in
++  // IntArgReg2 (r5).
++  masm.jump(bailoutTail);
++}
++
++// When bailout is done via out of line code (lazy bailout).
++// Frame size is stored in LR (look at
++// CodeGeneratorPPC64::generateOutOfLineCode()) and thunk code should save it
++// on stack.
++static void PushBailoutFrame(MacroAssembler& masm, Register spArg) {
++  // Push the frameSize_ stored in LR.
++  // See: CodeGeneratorPPC64::generateOutOfLineCode()
++  masm.pushReturnAddress();
++
++  // Push registers such that we can access them from [base + code].
++  masm.PushRegsInMask(AllRegs);
++
++  // Put pointer to BailoutStack as first argument to the Bailout().
++  masm.movePtr(StackPointer, spArg);
++}
++
++static void GenerateBailoutThunk(MacroAssembler& masm, Label* bailoutTail) {
++  PushBailoutFrame(masm, IntArgReg0);
++
++  // Make space for Bailout's bailoutInfo outparam.
++  masm.reserveStack(sizeof(void*));
++  masm.movePtr(StackPointer, IntArgReg1);
++
++  // Call the bailout function.
++  using Fn = bool (*)(BailoutStack* sp, BaselineBailoutInfo** info);
++  masm.setupUnalignedABICall(IntArgReg2);
++  masm.passABIArg(IntArgReg0);
++  masm.passABIArg(IntArgReg1);
++  masm.callWithABI<Fn, Bailout>(ABIType::General,
++                                CheckUnsafeCallWithABI::DontCheckOther);
++
++  // Get the bailoutInfo outparam.
++  masm.pop(IntArgReg2);
++
++  // Remove both the bailout frame and the topmost Ion frame's stack.
++  masm.moveToStackPtr(FramePointer);
++
++  // Jump to shared bailout tail. The BailoutInfo pointer has to be in
++  // IntArgReg2 (r5).
++  masm.jump(bailoutTail);
++}
++
++void JitRuntime::generateBailoutHandler(MacroAssembler& masm,
++                                        Label* bailoutTail) {
++  AutoCreatedBy acb(masm, "JitRuntime::generateBailoutHandler");
++
++  bailoutHandlerOffset_ = startTrampolineCode(masm);
++
++  GenerateBailoutThunk(masm, bailoutTail);
++}
++
++bool JitRuntime::generateVMWrapper(JSContext* cx, MacroAssembler& masm,
++                                   VMFunctionId id, const VMFunctionData& f,
++                                   DynFn nativeFun, uint32_t* wrapperOffset) {
++  AutoCreatedBy acb(masm, "JitRuntime::generateVMWrapper");
++
++  *wrapperOffset = startTrampolineCode(masm);
++
++  // Avoid conflicts with argument registers while discarding the result after
++  // the function call.
++  AllocatableGeneralRegisterSet regs(Register::Codes::WrapperMask);
++
++  static_assert(
++      (Register::Codes::VolatileMask & ~Register::Codes::WrapperMask) == 0,
++      "Wrapper register set should be a superset of Volatile register set.");
++
++  // The context is the first argument; r3 is the first argument register.
++  Register cxreg = IntArgReg0;
++  regs.take(cxreg);
++
++  // On link-register platforms, it is the responsibility of the VM *callee* to
++  // push the return address, while the caller must ensure that the address
++  // is stored in LR on entry. This allows the VM wrapper to work with both
++  // direct calls and tail calls.
++  masm.pushReturnAddress();
++
++  // Push the frame pointer to finish the exit frame, then link it up.
++  masm.Push(FramePointer);
++  masm.moveStackPtrTo(FramePointer);
++  masm.loadJSContext(cxreg);
++  masm.enterExitFrame(cxreg, regs.getAny(), id);
++
++  // Reserve space for the outparameter.
++  masm.reserveVMFunctionOutParamSpace(f);
++
++  masm.setupUnalignedABICallDontSaveRestoreSP();
++  masm.passABIArg(cxreg);
++
++  size_t argDisp = ExitFrameLayout::Size();
++
++  // Copy any arguments.
++  for (uint32_t explicitArg = 0; explicitArg < f.explicitArgs; explicitArg++) {
++    switch (f.argProperties(explicitArg)) {
++      case VMFunctionData::WordByValue:
++        if (f.argPassedInFloatReg(explicitArg)) {
++          masm.passABIArg(MoveOperand(FramePointer, argDisp), ABIType::Float64);
++        } else {
++          masm.passABIArg(MoveOperand(FramePointer, argDisp), ABIType::General);
++        }
++        argDisp += sizeof(void*);
++        break;
++      case VMFunctionData::WordByRef:
++        masm.passABIArg(MoveOperand(FramePointer, argDisp,
++                                    MoveOperand::Kind::EffectiveAddress),
++                        ABIType::General);
++        argDisp += sizeof(void*);
++        break;
++      case VMFunctionData::DoubleByValue:
++      case VMFunctionData::DoubleByRef:
++        MOZ_CRASH("NYI: PPC64 callVM should not be used with 128bits values.");
++        break;
++    }
++  }
++
++  // Copy the implicit outparam, if any.
++  const int32_t outParamOffset =
++      -int32_t(ExitFooterFrame::Size()) - f.sizeOfOutParamStackSlot();
++  if (f.outParam != Type_Void) {
++    masm.passABIArg(MoveOperand(FramePointer, outParamOffset,
++                                MoveOperand::Kind::EffectiveAddress),
++                    ABIType::General);
++  }
++
++  masm.callWithABI(nativeFun, ABIType::General,
++                   CheckUnsafeCallWithABI::DontCheckHasExitFrame);
++
++  // Test for failure.
++  switch (f.failType()) {
++    case Type_Cell:
++      masm.branchTestPtr(Assembler::Zero, IntArgReg0, IntArgReg0,
++                         masm.failureLabel());
++      break;
++    case Type_Bool:
++      masm.branchIfFalseBool(IntArgReg0, masm.failureLabel());
++      break;
++    case Type_Void:
++      break;
++    default:
++      MOZ_CRASH("unknown failure kind");
++  }
++
++  // Load the outparam.
++  masm.loadVMFunctionOutParam(f, Address(FramePointer, outParamOffset));
++
++  // Pop frame and restore frame pointer.
++  masm.moveToStackPtr(FramePointer);
++  masm.pop(FramePointer);
++
++  // Return. Subtract sizeof(void*) for the frame pointer.
++  masm.retn(Imm32(sizeof(ExitFrameLayout) - sizeof(void*) +
++                  f.explicitStackSlots() * sizeof(void*) +
++                  f.extraValuesToPop * sizeof(Value)));
++
++  return true;
++}
++
++uint32_t JitRuntime::generatePreBarrier(JSContext* cx, MacroAssembler& masm,
++                                        MIRType type) {
++  AutoCreatedBy acb(masm, "JitRuntime::generatePreBarrier");
++
++  uint32_t offset = startTrampolineCode(masm);
++
++  MOZ_ASSERT(PreBarrierReg == IntArgReg1);  // r4
++  Register temp1 = IntArgReg0;              // r3
++  Register temp2 = IntArgReg2;              // r5
++  Register temp3 = IntArgReg3;              // r6
++  masm.push(temp1);
++  masm.push(temp2);
++  masm.push(temp3);
++
++  Label noBarrier;
++  masm.emitPreBarrierFastPath(type, temp1, temp2, temp3, &noBarrier);
++
++  // Call into C++ to mark this GC thing.
++  masm.pop(temp3);
++  masm.pop(temp2);
++  masm.pop(temp1);
++
++  LiveRegisterSet save;
++  save.set() = RegisterSet(GeneralRegisterSet(Registers::VolatileMask),
++                           FloatRegisterSet(FloatRegisters::VolatileMask));
++  // On PPC64, save LR since we'll be making a call.
++  masm.pushReturnAddress();
++  masm.PushRegsInMask(save);
++
++  masm.movePtr(ImmPtr(cx->runtime()), IntArgReg0);
++
++  masm.setupUnalignedABICall(IntArgReg2);
++  masm.passABIArg(IntArgReg0);
++  masm.passABIArg(IntArgReg1);
++  masm.callWithABI(JitPreWriteBarrier(type));
++
++  masm.PopRegsInMask(save);
++  masm.ret();
++
++  masm.bind(&noBarrier);
++  masm.pop(temp3);
++  masm.pop(temp2);
++  masm.pop(temp1);
++  masm.abiret();
++
++  return offset;
++}
++
++void JitRuntime::generateBailoutTailStub(MacroAssembler& masm,
++                                         Label* bailoutTail) {
++  AutoCreatedBy acb(masm, "JitRuntime::generateBailoutTailStub");
++
++  masm.bind(bailoutTail);
++  masm.generateBailoutTail(IntArgReg1, IntArgReg2);
++}
+diff --git a/js/src/jit/shared/Assembler-shared.h b/js/src/jit/shared/Assembler-shared.h
+index d5fed2fabe31..490a9f5391e0 100644
+--- a/js/src/jit/shared/Assembler-shared.h
++++ b/js/src/jit/shared/Assembler-shared.h
+@@ -30,14 +30,15 @@
+ 
+ #if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) ||      \
+     defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+-    defined(JS_CODEGEN_WASM32) || defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_WASM32) || defined(JS_CODEGEN_RISCV64) || \
++    defined(JS_CODEGEN_PPC64)
+ // Push return addresses callee-side.
+ #  define JS_USE_LINK_REGISTER
+ #endif
+ 
+ #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_ARM64) ||    \
+     defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
+-    defined(JS_CODEGEN_ARM)
++    defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_PPC64)
+ // JS_CODELABEL_LINKMODE gives labels additional metadata
+ // describing how Bind() should patch them.
+ #  define JS_CODELABEL_LINKMODE
+diff --git a/js/src/jit/shared/AtomicOperations-feeling-lucky-gcc.h b/js/src/jit/shared/AtomicOperations-feeling-lucky-gcc.h
+index a6909e560bef..d886cba2c7e6 100644
+--- a/js/src/jit/shared/AtomicOperations-feeling-lucky-gcc.h
++++ b/js/src/jit/shared/AtomicOperations-feeling-lucky-gcc.h
+@@ -46,7 +46,8 @@
+ // code in this file.
+ 
+ #if defined(JS_SIMULATOR_ARM64) || defined(JS_SIMULATOR_ARM) || \
+-    defined(JS_SIMULATOR_MIPS64) || defined(JS_SIMULATOR_LOONG64)
++    defined(JS_SIMULATOR_MIPS64) || defined(JS_SIMULATOR_LOONG64) || \
++    defined(JS_SIMULATOR_PPC64)
+ // On some x86 (32-bit) systems this will not work because the compiler does not
+ // open-code 64-bit atomics.  If so, try linking with -latomic.  If that doesn't
+ // work, you're mostly on your own.
+diff --git a/js/src/jit/shared/CodeGenerator-shared.cpp b/js/src/jit/shared/CodeGenerator-shared.cpp
+index ada87f1f11a2..14468356cf31 100644
+--- a/js/src/jit/shared/CodeGenerator-shared.cpp
++++ b/js/src/jit/shared/CodeGenerator-shared.cpp
+@@ -86,8 +86,8 @@ CodeGeneratorShared::CodeGeneratorShared(MIRGenerator* gen, LIRGraph* graph,
+ 
+ #ifdef ENABLE_WASM_SIMD
+ #  if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) || \
+-      defined(JS_CODEGEN_ARM64)
+-    // On X64/x86 and ARM64, we don't need alignment for Wasm SIMD at this time.
++      defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
++    // On X64/x86, ARM64, and PPC64, we don't need alignment for Wasm SIMD at this time.
+ #  else
+ #    error \
+         "we may need padding so that local slots are SIMD-aligned and the stack must be kept SIMD-aligned too."
+@@ -1075,7 +1075,7 @@ Label* CodeGeneratorShared::getJumpLabelForBranch(MBasicBlock* block) {
+ // This function is not used for MIPS64/LOONG64/RISCV64. They have
+ // branchToBlock.
+ #if !defined(JS_CODEGEN_MIPS64) && !defined(JS_CODEGEN_LOONG64) && \
+-    !defined(JS_CODEGEN_RISCV64)
++    !defined(JS_CODEGEN_RISCV64) && !defined(JS_CODEGEN_PPC64)
+ void CodeGeneratorShared::jumpToBlock(MBasicBlock* mir,
+                                       Assembler::Condition cond) {
+   // Skip past trivial blocks.
+diff --git a/js/src/jit/shared/Lowering-shared-inl.h b/js/src/jit/shared/Lowering-shared-inl.h
+index bdcc1da7d41a..b62f8f681df1 100644
+--- a/js/src/jit/shared/Lowering-shared-inl.h
++++ b/js/src/jit/shared/Lowering-shared-inl.h
+@@ -527,7 +527,7 @@ LAllocation LIRGeneratorShared::useRegisterOrNonDoubleConstant(
+ 
+ #if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) ||      \
+     defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
+-    defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ LAllocation LIRGeneratorShared::useAnyOrConstant(MDefinition* mir) {
+   return useRegisterOrConstant(mir);
+ }
+diff --git a/js/src/js-config.mozbuild b/js/src/js-config.mozbuild
+index 22becaf4ecfb..ff5294825e9d 100644
+--- a/js/src/js-config.mozbuild
++++ b/js/src/js-config.mozbuild
+@@ -8,6 +8,7 @@ if (
+     CONFIG["JS_CODEGEN_X64"]
+     or CONFIG["JS_CODEGEN_ARM64"]
+     or CONFIG["JS_CODEGEN_RISCV64"]
++    or CONFIG["JS_CODEGEN_PPC64"]
+ ):
+     DEFINES["WASM_SUPPORTS_HUGE_MEMORY"] = True
+ 
+diff --git a/js/src/jsapi-tests/testJitABIcalls.cpp b/js/src/jsapi-tests/testJitABIcalls.cpp
+index b5c03a47dd83..887ad9e3d959 100644
+--- a/js/src/jsapi-tests/testJitABIcalls.cpp
++++ b/js/src/jsapi-tests/testJitABIcalls.cpp
+@@ -718,6 +718,9 @@ class JitABICall final : public jsapitest::RuntimeTest,
+ #elif defined(JS_CODEGEN_RISCV64)
+     Register base = t0;
+     regs.take(base);
++#elif defined(JS_CODEGEN_PPC64)
++    Register base = r11;
++    regs.take(base);
+ #else
+ #  error "Unknown architecture!"
+ #endif
+diff --git a/js/src/jsapi-tests/testWasmReturnCalls.cpp b/js/src/jsapi-tests/testWasmReturnCalls.cpp
+index 4728f2404ae8..a07ddb2f214e 100644
+--- a/js/src/jsapi-tests/testWasmReturnCalls.cpp
++++ b/js/src/jsapi-tests/testWasmReturnCalls.cpp
+@@ -32,7 +32,10 @@ BEGIN_TEST(testWasmCheckSlowCallMarkerHit) {
+ 
+   masm.bind(&check);
+ #  ifdef JS_USE_LINK_REGISTER
+-#    if !defined(JS_CODEGEN_LOONG64) && !defined(JS_CODEGEN_MIPS64) && \
++#    if defined(JS_CODEGEN_PPC64)
++  static constexpr Register ra = ABINonArgReg3;
++  masm.xs_mflr(ra);
++#    elif !defined(JS_CODEGEN_LOONG64) && !defined(JS_CODEGEN_MIPS64) && \
+         !defined(JS_CODEGEN_RISCV64)
+   static constexpr Register ra = lr;
+ #    endif
+@@ -70,7 +73,10 @@ BEGIN_TEST(testWasmCheckSlowCallMarkerMiss) {
+ 
+   masm.bind(&check);
+ #  ifdef JS_USE_LINK_REGISTER
+-#    if !defined(JS_CODEGEN_LOONG64) && !defined(JS_CODEGEN_MIPS64) && \
++#    if defined(JS_CODEGEN_PPC64)
++  static constexpr Register ra = ABINonArgReg3;
++  masm.xs_mflr(ra);
++#    elif !defined(JS_CODEGEN_LOONG64) && !defined(JS_CODEGEN_MIPS64) && \
+         !defined(JS_CODEGEN_RISCV64)
+   static constexpr Register ra = lr;
+ #    endif
+diff --git a/js/src/jsapi-tests/testsJit.cpp b/js/src/jsapi-tests/testsJit.cpp
+index a2dfe5d0196c..7f3dcca895d2 100644
+--- a/js/src/jsapi-tests/testsJit.cpp
++++ b/js/src/jsapi-tests/testsJit.cpp
+@@ -25,6 +25,14 @@ void PrepareJit(js::jit::MacroAssembler& masm) {
+ #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+     defined(JS_CODEGEN_RISCV64)
+   save.add(js::jit::ra);
++#elif defined(JS_CODEGEN_PPC64)
++  // LR on PPC64 isn't a GPR; save it to the stack manually.
++  {
++    UseScratchRegisterScope temps(masm);
++    Register scratch = temps.Acquire();
++    masm.xs_mflr(scratch);
++    masm.as_stdu(scratch, StackPointer, -8);
++  }
+ #elif defined(JS_USE_LINK_REGISTER)
+   save.add(js::jit::lr);
+ #endif
+@@ -44,6 +52,8 @@ bool ExecuteJit(JSContext* cx, js::jit::MacroAssembler& masm) {
+ #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+     defined(JS_CODEGEN_RISCV64)
+   restore.add(js::jit::ra);
++#elif defined(JS_CODEGEN_PPC64)
++  // LR will be restored manually after PopRegsInMask.
+ #elif defined(JS_USE_LINK_REGISTER)
+   restore.add(js::jit::lr);
+ #endif
+@@ -55,6 +65,16 @@ bool ExecuteJit(JSContext* cx, js::jit::MacroAssembler& masm) {
+ 
+   // Reset stack pointer.
+   masm.SetStackPointer64(PseudoStackPointer64);
++#elif defined(JS_CODEGEN_PPC64)
++  // Restore LR from the stack and return.
++  {
++    UseScratchRegisterScope temps(masm);
++    Register scratch = temps.Acquire();
++    masm.as_ld(scratch, StackPointer, 0);
++    masm.xs_mtlr(scratch);
++    masm.as_addi(StackPointer, StackPointer, 8);
++  }
++  masm.as_blr();
+ #else
+   // Exit the JIT-ed code using the ABI return style.
+   masm.abiret();
+diff --git a/js/src/shell/js.cpp b/js/src/shell/js.cpp
+index 45bc0796b964..20eb1231bb7f 100644
+--- a/js/src/shell/js.cpp
++++ b/js/src/shell/js.cpp
+@@ -7895,6 +7895,13 @@ static void SingleStepCallback(void* arg, jit::Simulator* sim, void* pc) {
+   state.fp = (void*)sim->getRegister(jit::Simulator::fp);
+   // see WasmTailCallFPScratchReg and CollapseWasmFrameFast
+   state.tempFP = (void*)sim->getRegister(jit::Simulator::t3);
++#  elif defined(JS_SIMULATOR_PPC64)
++  state.sp = (void*)sim->getRegister(jit::Simulator::sp);
++  state.lr = (void*)sim->getLR();
++  state.fp = (void*)sim->getRegister(jit::Simulator::fp);
++  // WasmTailCallFPScratchReg = ABINonArgReg3 = r22 holds the unwind FP
++  // during the wasm tail-call collapse window (RestoreFpRa unwind info).
++  state.tempFP = (void*)sim->getRegister(jit::Simulator::r22);
+ #  else
+ #    error "NYI: Single-step profiling support"
+ #  endif
+@@ -13144,6 +13151,15 @@ bool InitOptionParser(OptionParser& op) {
+                        "Stop the RISC-V simulator after the given "
+                        "NUMBER of instructions.",
+                        -1) ||
++#endif
++#ifdef JS_SIMULATOR_PPC64
++      !op.addBoolOption('\0', "ppc64-sim-icache-checks",
++                        "Enable icache flush checks in the PPC64 "
++                        "simulator.") ||
++      !op.addIntOption('\0', "ppc64-sim-stop-at", "NUMBER",
++                       "Stop the PPC64 simulator after the given "
++                       "NUMBER of instructions.",
++                       -1) ||
+ #endif
+       !op.addIntOption('\0', "nursery-size", "SIZE-MB",
+                        "Set the maximum nursery size in MB",
+@@ -14235,6 +14251,15 @@ bool SetContextJITOptions(JSContext* cx, const OptionParser& op) {
+   if (stopAt >= 0) {
+     jit::Simulator::StopSimAt = stopAt;
+   }
++#elif defined(JS_SIMULATOR_PPC64)
++  if (op.getBoolOption("ppc64-sim-icache-checks")) {
++    jit::SimulatorProcess::ICacheCheckingDisableCount = 0;
++  }
++
++  int32_t stopAt = op.getIntOption("ppc64-sim-stop-at");
++  if (stopAt >= 0) {
++    jit::Simulator::StopSimAt = stopAt;
++  }
+ #endif
+ 
+ #ifdef DEBUG
+diff --git a/js/src/shell/jsshell.h b/js/src/shell/jsshell.h
+index e8d47ba6888c..57e2b15f3cdd 100644
+--- a/js/src/shell/jsshell.h
++++ b/js/src/shell/jsshell.h
+@@ -22,7 +22,8 @@
+ 
+ // Some platform hooks must be implemented for single-step profiling.
+ #if defined(JS_SIMULATOR_ARM) || defined(JS_SIMULATOR_MIPS64) || \
+-    defined(JS_SIMULATOR_ARM64) || defined(JS_SIMULATOR_LOONG64)
++    defined(JS_SIMULATOR_ARM64) || defined(JS_SIMULATOR_LOONG64) || \
++    defined(JS_SIMULATOR_RISCV64) || defined(JS_SIMULATOR_PPC64)
+ #  define SINGLESTEP_PROFILING
+ #endif
+ 
+diff --git a/js/src/tests/shell/os.js b/js/src/tests/shell/os.js
+index 929982756548..f3d2396b17eb 100644
+--- a/js/src/tests/shell/os.js
++++ b/js/src/tests/shell/os.js
+@@ -20,7 +20,13 @@ var info = os.waitpid(kidpid, true);
+ assertEq(info.hasOwnProperty("pid"), false);
+ assertEq(info.hasOwnProperty("exitStatus"), false);
+ 
+-os.kill(kidpid);
++// Use SIGKILL (9) instead of the default SIGINT: under heavy parallel test
++// load, SIGINT delivery can race with the child's signal-handler setup and
++// the kernel's reaping path, leading to waitpid below blocking until the
++// `sleep 60` exits normally. SIGKILL is uncatchable and forces immediate
++// termination, so the assertion below ("killed process should not have
++// exitStatus") is reliable.
++os.kill(kidpid, 9);
+ 
+ info = os.waitpid(kidpid);
+ assertEq(info.hasOwnProperty("pid"), true, "waiting on dead process should return pid");
+diff --git a/js/src/util/Poison.h b/js/src/util/Poison.h
+index 721ecff6149d..de7981aa6f60 100644
+--- a/js/src/util/Poison.h
++++ b/js/src/util/Poison.h
+@@ -92,6 +92,8 @@ const uint8_t JS_SCOPE_DATA_TRAILING_NAMES_PATTERN = 0xCC;
+ #elif defined(JS_CODEGEN_RISCV64)
+ #  define JS_SWEPT_CODE_PATTERN \
+     0x29  // illegal sb instruction, crashes in user mode.
++#elif defined(JS_CODEGEN_PPC64)
++#  define JS_SWEPT_CODE_PATTERN 0x00  // illegal instruction (all zeros)
+ #else
+ #  error "JS_SWEPT_CODE_PATTERN not defined for this platform"
+ #endif
+diff --git a/js/src/wasm/WasmAnyRef.h b/js/src/wasm/WasmAnyRef.h
+index f81d4c6171b6..7200e9ab0e23 100644
+--- a/js/src/wasm/WasmAnyRef.h
++++ b/js/src/wasm/WasmAnyRef.h
+@@ -209,7 +209,7 @@ class AnyRef {
+     // Truncate the value to the 31-bit value size.
+     uintptr_t wideValue = uintptr_t(value & 0x7FFFFFFF);
+ #elif defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
+-    defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+     // Sign extend the value to the native pointer size.
+     uintptr_t wideValue = uintptr_t(int64_t((uint64_t(value) << 33)) >> 33);
+ #elif !defined(JS_64BIT)
+@@ -234,6 +234,11 @@ class AnyRef {
+ #  ifdef DEBUG
+ #    if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64)
+     MOZ_ASSERT(value <= UINT32_MAX);
++#    elif defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
++        defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
++    // On sign-extending platforms, a canonical i32 must be the sign
++    // extension of its low 32 bits.
++    MOZ_ASSERT(value == uintptr_t(int64_t(int32_t(value))));
+ #    endif
+ #  endif
+   }
+diff --git a/js/src/wasm/WasmBCDefs.h b/js/src/wasm/WasmBCDefs.h
+index b44e91e28693..66a8c9afe8c6 100644
+--- a/js/src/wasm/WasmBCDefs.h
++++ b/js/src/wasm/WasmBCDefs.h
+@@ -44,6 +44,9 @@
+ #if defined(JS_CODEGEN_RISCV64)
+ #  include "jit/riscv64/Assembler-riscv64.h"
+ #endif
++#if defined(JS_CODEGEN_PPC64)
++#  include "jit/ppc64/Assembler-ppc64.h"
++#endif
+ #include "js/ScalarType.h"
+ #include "util/Memory.h"
+ #include "wasm/WasmCodegenTypes.h"
+@@ -151,6 +154,10 @@ enum class RhsDestOp { True = true };
+ #  define RABALDR_PIN_INSTANCE
+ #endif
+ 
++#ifdef JS_CODEGEN_PPC64
++#  define RABALDR_PIN_INSTANCE
++#endif
++
+ // Max number of pushes onto the value stack for any opcode or emitter that
+ // does not push a variable, unbounded amount (anything with multiple
+ // results).  This includes also intermediate pushes such as values pushed as
+diff --git a/js/src/wasm/WasmBCMemory.cpp b/js/src/wasm/WasmBCMemory.cpp
+index 835512b09b8c..9137b09f4684 100644
+--- a/js/src/wasm/WasmBCMemory.cpp
++++ b/js/src/wasm/WasmBCMemory.cpp
+@@ -372,7 +372,7 @@ void BaseCompiler::boundsCheckBelow4GBAccess(uint32_t memoryIndex,
+ // Make sure the ptr could be used as an index register.
+ static inline void ToValidIndex(MacroAssembler& masm, RegI32 ptr) {
+ #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+-    defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   // When ptr is used as an index, it will be added to a 64-bit register.
+   // So we should explicitly promote ptr to 64-bit. Since now ptr holds a
+   // unsigned 32-bit value, we zero-extend it to 64-bit here.
+@@ -645,6 +645,13 @@ void BaseCompiler::executeLoad(MemoryAccessDesc* access, AccessCheck* check,
+   } else {
+     masm.wasmLoad(*access, memoryBase, ptr, ptr, dest.any());
+   }
++#elif defined(JS_CODEGEN_PPC64)
++  MOZ_ASSERT(temp.isInvalid());
++  if (dest.tag == AnyReg::I64) {
++    masm.wasmLoadI64(*access, memoryBase, ptr, ptr, dest.i64());
++  } else {
++    masm.wasmLoad(*access, memoryBase, ptr, ptr, dest.any());
++  }
+ #else
+   MOZ_CRASH("BaseCompiler platform hook: load");
+ #endif
+@@ -675,10 +682,11 @@ void BaseCompiler::load(MemoryAccessDesc* access, AccessCheck* check,
+   // generated is the same for the 64-bit and the 32-bit case.
+   return executeLoad(access, check, instance, memoryBase, RegI32(ptr.reg), dest,
+                      maybeFromI64(temp));
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64)
+-  // On mips64 and loongarch64, the 'prepareMemoryAccess' function will make
+-  // sure that ptr holds a valid 64-bit index value. Thus the code generated in
+-  // 'executeLoad' is the same for the 64-bit and the 32-bit case.
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
++    defined(JS_CODEGEN_PPC64)
++  // On mips64, loongarch64, and ppc64, the 'prepareMemoryAccess' function will
++  // make sure that ptr holds a valid 64-bit index value. Thus the code
++  // generated in 'executeLoad' is the same for the 64-bit and the 32-bit case.
+   return executeLoad(access, check, instance, memoryBase, RegI32(ptr.reg), dest,
+                      maybeFromI64(temp));
+ #elif defined(JS_CODEGEN_RISCV64)
+@@ -788,6 +796,13 @@ void BaseCompiler::executeStore(MemoryAccessDesc* access, AccessCheck* check,
+   } else {
+     masm.wasmStore(*access, src.any(), memoryBase, ptr, ptr);
+   }
++#elif defined(JS_CODEGEN_PPC64)
++  MOZ_ASSERT(temp.isInvalid());
++  if (access->type() == Scalar::Int64) {
++    masm.wasmStoreI64(*access, src.i64(), memoryBase, ptr, ptr);
++  } else {
++    masm.wasmStore(*access, src.any(), memoryBase, ptr, ptr);
++  }
+ #else
+   MOZ_CRASH("BaseCompiler platform hook: store");
+ #endif
+@@ -812,7 +827,7 @@ void BaseCompiler::store(MemoryAccessDesc* access, AccessCheck* check,
+                       maybeFromI64(temp));
+ #elif defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64) ||    \
+     defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+-    defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   return executeStore(access, check, instance, memoryBase, RegI32(ptr.reg), src,
+                       maybeFromI64(temp));
+ #else
+@@ -1295,7 +1310,8 @@ static void Deallocate(BaseCompiler* bc, RegI32 rv, const Temps& temps) {
+   bc->freeI32(temps.t0);
+ }
+ 
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64)
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
++    defined(JS_CODEGEN_PPC64)
+ 
+ struct Temps {
+   RegI32 t0, t1, t2;
+@@ -1504,7 +1520,7 @@ static void Deallocate(BaseCompiler* bc, AtomicOp op, RegI64 rv, RegI64 temp) {
+ }
+ 
+ #elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
+-    defined(JS_CODEGEN_LOONG64)
++    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_PPC64)
+ 
+ static void PopAndAllocate(BaseCompiler* bc, AtomicOp op, RegI64* rd,
+                            RegI64* rv, RegI64* temp) {
+@@ -1678,7 +1694,8 @@ static void Deallocate(BaseCompiler* bc, RegI32 rv, const Temps&) {
+   bc->freeI32(rv);
+ }
+ 
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64)
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
++    defined(JS_CODEGEN_PPC64)
+ 
+ struct Temps {
+   RegI32 t0, t1, t2;
+@@ -1844,7 +1861,7 @@ static void Deallocate(BaseCompiler* bc, RegI64 rd, RegI64 rv) {
+ }
+ 
+ #elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
+-    defined(JS_CODEGEN_LOONG64)
++    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_PPC64)
+ 
+ static void PopAndAllocate(BaseCompiler* bc, RegI64* rd, RegI64* rv) {
+   *rv = bc->popI64();
+@@ -2017,7 +2034,8 @@ static void Deallocate(BaseCompiler* bc, RegI32 rexpect, RegI32 rnew,
+   bc->freeI32(rexpect);
+ }
+ 
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64)
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
++    defined(JS_CODEGEN_PPC64)
+ 
+ struct Temps {
+   RegI32 t0, t1, t2;
+@@ -2287,7 +2305,7 @@ static void Deallocate(BaseCompiler* bc, RegI64 rexpect, RegI64 rnew) {
+ }
+ 
+ #elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
+-    defined(JS_CODEGEN_LOONG64)
++    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_PPC64)
+ 
+ template <typename RegAddressType>
+ static void PopAndAllocate(BaseCompiler* bc, RegI64* rexpect, RegI64* rnew,
+@@ -2885,6 +2903,11 @@ void BaseCompiler::loadExtend(MemoryAccessDesc* access, Scalar::Type viewType) {
+   RegI64 rs = popI64();
+   RegV128 rd = needV128();
+   masm.moveGPR64ToDouble(rs, rd);
++#ifdef JS_CODEGEN_PPC64
++  // mtvsrd places value in BE dw0 (= LE dw1). widenLow* operates on LE dw0.
++  // Swap dwords to move loaded data to the correct half.
++  masm.as_xxpermdi(rd, rd, rd, 2);
++#endif
+   switch (viewType) {
+     case Scalar::Int8:
+       masm.widenLowInt8x16(rd, rd);
+diff --git a/js/src/wasm/WasmBCRegDefs.h b/js/src/wasm/WasmBCRegDefs.h
+index bb84f0863de2..fd37bd464f39 100644
+--- a/js/src/wasm/WasmBCRegDefs.h
++++ b/js/src/wasm/WasmBCRegDefs.h
+@@ -118,6 +118,13 @@ static constexpr Register RabaldrScratchI32 = CallTempReg2;
+ static constexpr Register RabaldrScratchI32 = CallTempReg2;
+ #endif
+ 
++#ifdef JS_CODEGEN_PPC64
++#  define RABALDR_SCRATCH_I32
++// Use r25 (callee-saved, non-arg, not used by any wasm infrastructure)
++// instead of CallTempReg2 (r10) which is IntArgReg7.
++static constexpr Register RabaldrScratchI32 = r25;
++#endif
++
+ #ifdef RABALDR_SCRATCH_F32_ALIASES_F64
+ #  if !defined(RABALDR_SCRATCH_F32) || !defined(RABALDR_SCRATCH_F64)
+ #    error "Bad configuration"
+@@ -386,8 +393,9 @@ struct SpecificRegs {
+ 
+   SpecificRegs() : abiReturnRegI64(ReturnReg64) {}
+ };
+-#elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
+-    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) ||  \
++    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
++    defined(JS_CODEGEN_PPC64)
+ struct SpecificRegs {
+   // Required by gcc.
+   SpecificRegs() {}
+diff --git a/js/src/wasm/WasmBaselineCompile.cpp b/js/src/wasm/WasmBaselineCompile.cpp
+index 2af7ad7f583b..c57180dd362b 100644
+--- a/js/src/wasm/WasmBaselineCompile.cpp
++++ b/js/src/wasm/WasmBaselineCompile.cpp
+@@ -376,11 +376,15 @@ void BaseCompiler::tableSwitch(Label* theTable, RegI32 switchValue,
+   masm.ma_ldr(DTRAddr(scratch, DtrRegImmShift(switchValue, LSL, 2)), pc, Offset,
+               Assembler::Always);
+ #elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+-    defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   ScratchI32 scratch(*this);
+   CodeLabel tableCl;
+ 
++#  if defined(JS_CODEGEN_PPC64)
++  masm.mov(&tableCl, scratch);
++#  else
+   masm.ma_li(scratch, &tableCl);
++#  endif
+ 
+   tableCl.target()->bind(theTable->offset());
+   masm.addCodeLabel(tableCl);
+@@ -898,7 +902,7 @@ void BaseCompiler::insertBreakablePoint(CallSiteKind kind) {
+   masm.append(CallSiteDesc(iter_.lastOpcodeOffset(), kind),
+               CodeOffset(masm.currentOffset()));
+ #elif defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
+-    defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   ScratchPtr scratch(*this);
+   Label L;
+   masm.loadPtr(Address(InstanceReg, Instance::offsetOfDebugStub()), scratch);
+@@ -972,7 +976,7 @@ void BaseCompiler::insertPerFunctionDebugStub() {
+     masm.ma_bx(lr, Assembler::Zero);
+   }
+ #elif defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
+-    defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   {
+     ScratchPtr scratch(*this);
+ 
+@@ -1403,7 +1407,7 @@ void BaseCompiler::popStackResults(ABIResultIter& iter, StackHeight stackBase) {
+     switch (v.kind()) {
+       case Stk::ConstI32:
+ #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+-    defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+         fr.storeImmediatePtrToStack(v.i32val_, resultHeight, temp);
+ #else
+         fr.storeImmediatePtrToStack(uint32_t(v.i32val_), resultHeight, temp);
+@@ -1723,6 +1727,13 @@ void BaseCompiler::passArg(ValType type, const Stk& arg, FunctionCall* call) {
+                                       argLoc.offsetFromArgBase()));
+       } else {
+         loadI32(arg, RegI32(argLoc.gpr()));
++#ifdef JS_CODEGEN_PPC64
++        // addi can sign-extend, which yields wrong values when the C++
++        // callee expects a uint32_t. Clear the upper 32 bits.
++        if (call->abiKind == ABIKind::System) {
++          masm.as_rldicl(argLoc.gpr(), argLoc.gpr(), 0, 32);
++        }
++#endif
+       }
+       break;
+     }
+@@ -2372,9 +2383,10 @@ void BaseCompiler::finishTryNote(size_t tryNoteIndex) {
+ RegI32 BaseCompiler::needRotate64Temp() {
+ #if defined(JS_CODEGEN_X86)
+   return needI32();
+-#elif defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) ||    \
+-    defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
+-    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#elif defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) ||       \
++    defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) ||    \
++    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
++    defined(JS_CODEGEN_PPC64)
+   return RegI32::Invalid();
+ #else
+   MOZ_CRASH("BaseCompiler platform hook: needRotate64Temp");
+@@ -2433,6 +2445,8 @@ void BaseCompiler::popAndAllocateForMulI64(RegI64* r0, RegI64* r1,
+   pop2xI64(r0, r1);
+ #elif defined(JS_CODEGEN_RISCV64)
+   pop2xI64(r0, r1);
++#elif defined(JS_CODEGEN_PPC64)
++  pop2xI64(r0, r1);
+ #else
+   MOZ_CRASH("BaseCompiler porting interface: popAndAllocateForMulI64");
+ #endif
+@@ -2866,6 +2880,9 @@ static RegI32 PopcntTemp(BaseCompiler& bc) {
+     defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+     defined(JS_CODEGEN_RISCV64)
+   return bc.needI32();
++#elif defined(JS_CODEGEN_PPC64)
++  // PPC64 has native popcntd/popcntw; no temp register needed.
++  return RegI32::Invalid();
+ #else
+   MOZ_CRASH("BaseCompiler platform hook: PopcntTemp");
+ #endif
+@@ -9362,6 +9379,11 @@ static void MulI64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
+                      RegV128 temp1, RegV128 temp2) {
+   masm.mulInt64x2(rsd, rs, rsd, temp1, temp2);
+ }
++#  elif defined(JS_CODEGEN_PPC64)
++static void MulI64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
++                     RegV128 temp1, RegV128 temp2) {
++  masm.mulInt64x2(rsd, rs, rsd, temp1, temp2);
++}
+ #  endif
+ 
+ static void MulF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
+@@ -9376,7 +9398,8 @@ static void DivF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
+   masm.divFloat64x2(rsd, rs, rsd);
+ }
+ 
+-#  if defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64)
++#  if defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64) || \
++      defined(JS_CODEGEN_PPC64)
+ static void MinF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
+                      RegV128 temp1, RegV128 temp2) {
+   masm.minFloat32x4(rsd, rs, rsd, temp1, temp2);
+@@ -9397,6 +9420,22 @@ static void MaxF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
+   masm.maxFloat64x2(rsd, rs, rsd, temp1, temp2);
+ }
+ 
++#  if defined(JS_CODEGEN_PPC64)
++// PPC64: use non-RhsDestOp convention (first=rhs, second=lhsDest),
++// matching the pseudoMin/Max function signature.
++static void PMinF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
++  masm.pseudoMinFloat32x4(rs, rsd);
++}
++static void PMinF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
++  masm.pseudoMinFloat64x2(rs, rsd);
++}
++static void PMaxF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
++  masm.pseudoMaxFloat32x4(rs, rsd);
++}
++static void PMaxF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
++  masm.pseudoMaxFloat64x2(rs, rsd);
++}
++#  else
+ static void PMinF32x4(MacroAssembler& masm, RegV128 rsd, RegV128 rs,
+                       RhsDestOp) {
+   masm.pseudoMinFloat32x4(rsd, rs);
+@@ -9416,6 +9455,7 @@ static void PMaxF64x2(MacroAssembler& masm, RegV128 rsd, RegV128 rs,
+                       RhsDestOp) {
+   masm.pseudoMaxFloat64x2(rsd, rs);
+ }
++#  endif
+ #  elif defined(JS_CODEGEN_ARM64)
+ static void MinF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
+   masm.minFloat32x4(rs, rsd);
+@@ -9806,6 +9846,68 @@ static void ShiftRightI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
+   masm.rightShiftInt64x2(rsd, temp, rsd);
+ }
+ 
++static void ShiftRightUI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++                             RegI32 temp) {
++  ShiftOpMask(masm, SimdOp::I64x2ShrU, rs, temp);
++  masm.unsignedRightShiftInt64x2(rsd, temp, rsd);
++}
++#  elif defined(JS_CODEGEN_PPC64)
++// PPC64: same as ARM64 pattern (shift amount in GPR, result in vector reg)
++static void ShiftLeftI8x16(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++                           RegI32 temp) {
++  ShiftOpMask(masm, SimdOp::I8x16Shl, rs, temp);
++  masm.leftShiftInt8x16(rsd, temp, rsd);
++}
++static void ShiftLeftI16x8(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++                           RegI32 temp) {
++  ShiftOpMask(masm, SimdOp::I16x8Shl, rs, temp);
++  masm.leftShiftInt16x8(rsd, temp, rsd);
++}
++static void ShiftLeftI32x4(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++                           RegI32 temp) {
++  ShiftOpMask(masm, SimdOp::I32x4Shl, rs, temp);
++  masm.leftShiftInt32x4(rsd, temp, rsd);
++}
++static void ShiftLeftI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++                           RegI32 temp) {
++  ShiftOpMask(masm, SimdOp::I64x2Shl, rs, temp);
++  masm.leftShiftInt64x2(rsd, temp, rsd);
++}
++static void ShiftRightI8x16(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++                            RegI32 temp) {
++  ShiftOpMask(masm, SimdOp::I8x16ShrS, rs, temp);
++  masm.rightShiftInt8x16(rsd, temp, rsd);
++}
++static void ShiftRightUI8x16(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++                             RegI32 temp) {
++  ShiftOpMask(masm, SimdOp::I8x16ShrU, rs, temp);
++  masm.unsignedRightShiftInt8x16(rsd, temp, rsd);
++}
++static void ShiftRightI16x8(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++                            RegI32 temp) {
++  ShiftOpMask(masm, SimdOp::I16x8ShrS, rs, temp);
++  masm.rightShiftInt16x8(rsd, temp, rsd);
++}
++static void ShiftRightUI16x8(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++                             RegI32 temp) {
++  ShiftOpMask(masm, SimdOp::I16x8ShrU, rs, temp);
++  masm.unsignedRightShiftInt16x8(rsd, temp, rsd);
++}
++static void ShiftRightI32x4(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++                            RegI32 temp) {
++  ShiftOpMask(masm, SimdOp::I32x4ShrS, rs, temp);
++  masm.rightShiftInt32x4(rsd, temp, rsd);
++}
++static void ShiftRightUI32x4(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++                             RegI32 temp) {
++  ShiftOpMask(masm, SimdOp::I32x4ShrU, rs, temp);
++  masm.unsignedRightShiftInt32x4(rsd, temp, rsd);
++}
++static void ShiftRightI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++                            RegI32 temp) {
++  ShiftOpMask(masm, SimdOp::I64x2ShrS, rs, temp);
++  masm.rightShiftInt64x2(rsd, temp, rsd);
++}
+ static void ShiftRightUI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
+                              RegI32 temp) {
+   ShiftOpMask(masm, SimdOp::I64x2ShrU, rs, temp);
+@@ -10107,6 +10209,23 @@ static void BitmaskI32x4(MacroAssembler& masm, RegV128 rs, RegI32 rd,
+   masm.bitmaskInt32x4(rs, rd, temp);
+ }
+ 
++static void BitmaskI64x2(MacroAssembler& masm, RegV128 rs, RegI32 rd,
++                         RegV128 temp) {
++  masm.bitmaskInt64x2(rs, rd, temp);
++}
++#  elif defined(JS_CODEGEN_PPC64)
++static void BitmaskI8x16(MacroAssembler& masm, RegV128 rs, RegI32 rd,
++                         RegV128 temp) {
++  masm.bitmaskInt8x16(rs, rd, temp);
++}
++static void BitmaskI16x8(MacroAssembler& masm, RegV128 rs, RegI32 rd,
++                         RegV128 temp) {
++  masm.bitmaskInt16x8(rs, rd, temp);
++}
++static void BitmaskI32x4(MacroAssembler& masm, RegV128 rs, RegI32 rd,
++                         RegV128 temp) {
++  masm.bitmaskInt32x4(rs, rd, temp);
++}
+ static void BitmaskI64x2(MacroAssembler& masm, RegV128 rs, RegI32 rd,
+                          RegV128 temp) {
+   masm.bitmaskInt64x2(rs, rd, temp);
+@@ -10182,6 +10301,13 @@ static void BitselectV128(MacroAssembler& masm, RegV128 rhs, RegV128 control,
+   masm.bitwiseSelectSimd128(lhsDest, rhs, temp);
+   masm.moveSimd128(temp, lhsDest);
+ }
++#  elif defined(JS_CODEGEN_PPC64)
++static void BitselectV128(MacroAssembler& masm, RegV128 rhs, RegV128 control,
++                          RegV128 lhsDest, RegV128 temp) {
++  masm.moveSimd128(control, temp);
++  masm.bitwiseSelectSimd128(lhsDest, rhs, temp);
++  masm.moveSimd128(temp, lhsDest);
++}
+ #  endif
+ 
+ #  ifdef ENABLE_WASM_RELAXED_SIMD
+@@ -10257,7 +10383,7 @@ void BaseCompiler::emitDotI8x16I7x16AddS() {
+   RegV128 rsd = popV128();
+   RegV128 rs0, rs1;
+   pop2xV128(&rs0, &rs1);
+-#    if defined(JS_CODEGEN_ARM64)
++#    if defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
+   RegV128 temp = needV128();
+   masm.dotInt8x16Int7x16ThenAdd(rs0, rs1, rsd, temp);
+   freeV128(temp);
+@@ -10463,7 +10589,7 @@ bool BaseCompiler::emitVectorLaneSelect() {
+   freeV128(lhs);
+   freeV128(mask);
+   pushV128(rhsDest);
+-#    elif defined(JS_CODEGEN_ARM64)
++#    elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
+   RegV128 maskDest = popV128();
+   RegV128 rhs = popV128();
+   RegV128 lhs = popV128();
+@@ -12628,7 +12754,7 @@ bool js::wasm::BaselinePlatformSupport() {
+ #if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) ||        \
+     defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) ||      \
+     defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+-    defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   return true;
+ #else
+   return false;
+diff --git a/js/src/wasm/WasmCodegenConstants.h b/js/src/wasm/WasmCodegenConstants.h
+index 9c10d307ae6f..e25332b5464e 100644
+--- a/js/src/wasm/WasmCodegenConstants.h
++++ b/js/src/wasm/WasmCodegenConstants.h
+@@ -43,7 +43,8 @@ static const unsigned InterpFailInstanceReg = 0xbad;
+ // The following thresholds were derived from a microbenchmark. If we begin to
+ // ship this optimization for more platforms, we will need to extend this list.
+ 
+-#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64)
++#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64) || \
++    defined(JS_CODEGEN_PPC64)
+ static const uint32_t MaxInlineMemoryCopyLength = 64;
+ static const uint32_t MaxInlineMemoryFillLength = 64;
+ #elif defined(JS_CODEGEN_X86)
+diff --git a/js/src/wasm/WasmCodegenTypes.cpp b/js/src/wasm/WasmCodegenTypes.cpp
+index 8b9f32639ea3..e906c4afecc3 100644
+--- a/js/src/wasm/WasmCodegenTypes.cpp
++++ b/js/src/wasm/WasmCodegenTypes.cpp
+@@ -144,14 +144,15 @@ void TrapSitesForKind::checkInvariants(const uint8_t* codeBase) const {
+     last = pcOffset;
+   }
+ 
+-#  if (defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) ||   \
+-       defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_ARM) || \
+-       defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64))
++#  if (defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) ||        \
++       defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_ARM) ||      \
++       defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
++       defined(JS_CODEGEN_PPC64))
+   // Check that each trapsite is associated with a plausible instruction.  The
+   // required instruction kind depends on the trapsite kind.
+   //
+-  // NOTE: currently enabled on x86_{32,64}, arm{32,64}, loongson64 and mips64.
+-  // Ideally it should be extended to riscv64 too.
++  // NOTE: currently enabled on x86_{32,64}, arm{32,64}, loongson64, mips64,
++  // and ppc64. Ideally it should be extended to riscv64 too.
+   //
+   for (uint32_t i = 0; i < length(); i++) {
+     uint32_t pcOffset = pcOffsets_[i];
+diff --git a/js/src/wasm/WasmCompile.cpp b/js/src/wasm/WasmCompile.cpp
+index 051c60ebaa55..89447aa668ff 100644
+--- a/js/src/wasm/WasmCompile.cpp
++++ b/js/src/wasm/WasmCompile.cpp
+@@ -71,8 +71,9 @@ uint32_t wasm::ObservedCPUFeatures() {
+     ARM64 = 0x6,
+     LOONG64 = 0x7,
+     RISCV64 = 0x8,
++    PPC64 = 0x9,
+ 
+-    LAST = RISCV64,
++    LAST = PPC64,
+     ARCH_BITS = 4
+   };
+ 
+@@ -101,6 +102,9 @@ uint32_t wasm::ObservedCPUFeatures() {
+ #elif defined(JS_CODEGEN_RISCV64)
+   MOZ_ASSERT(jit::GetRISCV64Flags() <= (UINT32_MAX >> ARCH_BITS));
+   return RISCV64 | (jit::GetRISCV64Flags() << ARCH_BITS);
++#elif defined(JS_CODEGEN_PPC64)
++  MOZ_ASSERT(jit::GetPPC64Flags() <= (UINT32_MAX >> ARCH_BITS));
++  return PPC64 | (jit::GetPPC64Flags() << ARCH_BITS);
+ #elif defined(JS_CODEGEN_NONE) || defined(JS_CODEGEN_WASM32)
+   return 0;
+ #else
+diff --git a/js/src/wasm/WasmFrameIter.cpp b/js/src/wasm/WasmFrameIter.cpp
+index b3b264bc625a..b540acf9a05d 100644
+--- a/js/src/wasm/WasmFrameIter.cpp
++++ b/js/src/wasm/WasmFrameIter.cpp
+@@ -622,6 +622,19 @@ static const unsigned PushedFP = 16;
+ static const unsigned SetFP = 20;
+ static const unsigned PoppedFP = 4;
+ static const unsigned PoppedFPJitEntry = 8;
++#elif defined(JS_CODEGEN_PPC64)
++// pushReturnAddress = mflr(4) + stdu(4) = 8 bytes.
++// push(FP) = stdu(4) = 4 bytes (PPC64 stdu is a single DS-form instruction).
++// moveStackPtrTo = mr(4) = 4 bytes.
++static const unsigned PushedRetAddr = 8;
++static const unsigned PushedFP = 12;
++static const unsigned SetFP = 16;
++// Callable + jit-entry epilogues between poppedFP and *ret are:
++//   mtlr r0; addi sp, sp, 16  (two 4-byte instructions — 8 bytes).
++// mtlr must come before addi so LR holds the caller's RA throughout the
++// post-poppedFP window (single-step profiling fires every instruction).
++static const unsigned PoppedFP = 8;
++static const unsigned PoppedFPJitEntry = 8;
+ #elif defined(JS_CODEGEN_NONE) || defined(JS_CODEGEN_WASM32)
+ // Synthetic values to satisfy asserts and avoid compiler warnings.
+ static const unsigned PushedRetAddr = 0;
+@@ -710,6 +723,17 @@ static void GenerateCallablePrologue(MacroAssembler& masm, uint32_t* entry) {
+     masm.moveStackPtrTo(FramePointer);
+     MOZ_ASSERT_IF(!masm.oom(), SetFP == masm.currentOffset() - *entry);
+   }
++#elif defined(JS_CODEGEN_PPC64)
++  {
++    *entry = masm.currentOffset();
++
++    masm.pushReturnAddress();
++    MOZ_ASSERT_IF(!masm.oom(), PushedRetAddr == masm.currentOffset() - *entry);
++    masm.push(FramePointer);
++    MOZ_ASSERT_IF(!masm.oom(), PushedFP == masm.currentOffset() - *entry);
++    masm.moveStackPtrTo(FramePointer);
++    MOZ_ASSERT_IF(!masm.oom(), SetFP == masm.currentOffset() - *entry);
++  }
+ #elif defined(JS_CODEGEN_ARM64)
+   {
+     // We do not use the PseudoStackPointer.  However, we may be called in a
+@@ -803,6 +827,38 @@ static void GenerateCallableEpilogue(MacroAssembler& masm, unsigned framePushed,
+     masm.jalr(zero, ra, 0);
+     masm.nop();
+   }
++#elif defined(JS_CODEGEN_PPC64)
++  // Load RA and FP from the Frame while it's still on the stack.
++  // Using r0 (js::jit::r0) for RA is safe: it's volatile, used as
++  // RT (not base), and we're in an epilogue where it's not live.
++  masm.loadPtr(Address(StackPointer, Frame::returnAddressOffset()),
++               js::jit::r0);
++  masm.loadPtr(Address(StackPointer, Frame::callerFPOffset()), FramePointer);
++
++  // Fence the pool BEFORE capturing poppedFP. PoppedFP is a static 8
++  // (mtlr + addi); enterNoPool itself can emit insertNopFill() and a
++  // preemptive finishPool() at its top edge, so any leading insertions
++  // must land before poppedFP — not between poppedFP and *ret. A pool
++  // flush inside the 2-insn window would otherwise extend *ret - poppedFP
++  // and trip the post-condition assertion below. P9 routes FP constants
++  // through the pool so flushes are more frequent than on P8 (the
++  // assertion was historically silent on P8 but reproducible on P9 dbgopt).
++  masm.enterNoPool(2);
++  poppedFP = masm.currentOffset();
++
++  // Move RA into LR BEFORE popping the Frame. If the order were addi/mtlr,
++  // single-step profiling firing at the mtlr instruction would see: sp
++  // already moved (so saved RA at sp[8] is gone), addi already executed,
++  // and LR still holding the address right after the function's last `bl`
++  // (i.e. inside this function, not the caller's RA). With mtlr first,
++  // the entire post-poppedFP window has LR == caller's RA available
++  // either via sp[8] (pre-addi) or registers.lr (post-mtlr).
++  masm.xs_mtlr(js::jit::r0);
++  masm.addToStackPtr(Imm32(sizeof(Frame)));
++  *ret = masm.currentOffset();
++  masm.leaveNoPool();
++  masm.as_blr();
++
+ #elif defined(JS_CODEGEN_ARM64)
+ 
+   // See comment at equivalent place in |GenerateCallablePrologue| above.
+@@ -1483,6 +1539,9 @@ void wasm::GenerateJitEntryPrologue(MacroAssembler& masm,
+     BlockTrampolinePoolScope block_trampoline_pool(&masm, 10);
+     offsets->begin = masm.currentOffset();
+     masm.push(ra);
++#elif defined(JS_CODEGEN_PPC64)
++    offsets->begin = masm.currentOffset();
++    masm.pushReturnAddress();
+ #elif defined(JS_CODEGEN_ARM64)
+     {
+       AutoForbidPoolsAndNops afp(&masm,
+@@ -1536,6 +1595,20 @@ void wasm::GenerateJitEntryEpilogue(MacroAssembler& masm,
+     masm.Ret(ARMRegister(lr, 64));
+     masm.setFramePushed(0);
+   }
++#elif defined(JS_CODEGEN_PPC64)
++  // Load RA and FP from the frame while it's still on the stack, then
++  // restore LR, pop the frame, and return. mtlr must precede addi so LR
++  // holds the caller's RA across the whole post-poppedFP window (see
++  // GenerateCallableEpilogue for the matching rationale).
++  masm.loadPtr(Address(StackPointer, Frame::returnAddressOffset()),
++               js::jit::r0);
++  masm.loadPtr(Address(StackPointer, Frame::callerFPOffset()), FramePointer);
++  poppedFP = masm.currentOffset();
++
++  masm.xs_mtlr(js::jit::r0);
++  masm.addToStackPtr(Imm32(sizeof(Frame)));
++  offsets->ret = masm.currentOffset();
++  masm.as_blr();
+ #else
+   // Forbid pools for the same reason as described in GenerateCallablePrologue.
+ #  if defined(JS_CODEGEN_ARM)
+@@ -1905,6 +1978,22 @@ bool js::wasm::StartUnwinding(const RegisterState& registers,
+         fixedFP = fp;
+         AssertMatchesCallSite(fixedPC, fixedFP);
+       } else
++#elif defined(JS_CODEGEN_PPC64)
++      if (codeRange->isThunk()) {
++        // The FarJumpIsland sequence temporary scrambles the link register.
++        fixedPC = pc;
++        fixedFP = fp;
++        *unwoundCaller = false;
++        AssertMatchesCallSite(
++            Frame::fromUntaggedWasmExitFP(fp)->returnAddress(),
++            Frame::fromUntaggedWasmExitFP(fp)->rawCaller());
++      } else if (offsetFromEntry < PushedFP) {
++        // On PPC64 the return address is in LR (registers.lr) until
++        // pushReturnAddress() saves it to the stack.
++        fixedPC = (uint8_t*)registers.lr;
++        fixedFP = fp;
++        AssertMatchesCallSite(fixedPC, fixedFP);
++      } else
+ #elif defined(JS_CODEGEN_ARM64)
+       if (offsetFromEntry < SetFP || codeRange->isThunk()) {
+         // On ARM64 we rely on register state instead of state saved on
+@@ -1956,6 +2045,35 @@ bool js::wasm::StartUnwinding(const RegisterState& registers,
+         fixedPC = Frame::fromUntaggedWasmExitFP(sp)->returnAddress();
+         fixedFP = fp;
+         AssertMatchesCallSite(fixedPC, fixedFP);
++#elif defined(JS_CODEGEN_PPC64)
++      } else if (offsetInCode >= codeRange->ret() - PoppedFP &&
++                 offsetInCode < codeRange->ret()) {
++        // PPC64 epilogue (RA loaded into r0, FP restored, RA not yet
++        // moved to LR, SP not yet adjusted):
++        //   ld r0, 8(sp)      ; restore caller's RA into r0
++        //   ld FP, 0(sp)      ; restore caller's FP
++        //   <-- poppedFP -->
++        //   mtlr r0           ; LR := caller's RA
++        //   addi sp, sp, 16   ; pop the Frame
++        //   <-- ret -->
++        //   blr
++        // In the [poppedFP, ret) window the addi has not run, so *sp
++        // is still the saved Frame and sp[8] is the caller's RA.
++        // (registers.lr would also be correct after mtlr executes, but
++        // sp[8] is valid throughout this window — including before mtlr —
++        // so we read it consistently.)
++        MOZ_ASSERT(*sp == fp);
++        fixedPC = Frame::fromUntaggedWasmExitFP(sp)->returnAddress();
++        fixedFP = fp;
++        AssertMatchesCallSite(fixedPC, fixedFP);
++      } else if (offsetInCode == codeRange->ret()) {
++        // PPC64 epilogue, at the blr: addi has run, so SP is the
++        // caller's and *sp is unrelated memory. mtlr ran earlier in
++        // the [poppedFP, ret) window, so LR holds the caller's RA.
++        // fp holds the restored caller's FP.
++        fixedPC = (uint8_t*)registers.lr;
++        fixedFP = fp;
++        AssertMatchesCallSite(fixedPC, fixedFP);
+ #elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_LOONG64)
+         // The stack pointer does not move until all values have
+         // been restored so several cases can be coalesced here.
+diff --git a/js/src/wasm/WasmGC.cpp b/js/src/wasm/WasmGC.cpp
+index e59cd4f5aba0..21cd01fd1c5e 100644
+--- a/js/src/wasm/WasmGC.cpp
++++ b/js/src/wasm/WasmGC.cpp
+@@ -444,6 +444,14 @@ bool wasm::IsPlausibleStackMapKey(const uint8_t* nextPC) {
+             insn[-1] == 0x00000013 /* addi zero, zero, 0 */) ||  // jal; nop
+            (insn[-1] == 0x00100073 &&
+             (insn[-2] & kITypeMask) == RO_CSRRWI)));  // wasm trap
++#  elif defined(JS_CODEGEN_PPC64)
++  const uint32_t* insn = reinterpret_cast<const uint32_t*>(nextPC);
++  MOZ_ASSERT((uintptr_t(insn) & 3) == 0);
++  // xs_trap() = tw 31,r0,r0 (PPC_trap); bctrl = PPC_bctr|1; bl = I-form
++  // opcode 18 (PPC_b) with LK=1, AA=0, checked via 0xFC000003 mask.
++  return insn[-1] == uint32_t(PPC_trap) ||
++         insn[-1] == (uint32_t(PPC_bctr) | 1u) ||
++         (insn[-1] & 0xFC000003u) == (uint32_t(PPC_b) | 1u);
+ #  else
+   MOZ_CRASH("IsValidStackMapKey: requires implementation on this platform");
+ #  endif
+diff --git a/js/src/wasm/WasmGenerator.cpp b/js/src/wasm/WasmGenerator.cpp
+index 2dafac73e96a..07ffe150fc57 100644
+--- a/js/src/wasm/WasmGenerator.cpp
++++ b/js/src/wasm/WasmGenerator.cpp
+@@ -930,7 +930,23 @@ bool ModuleGenerator::finishCodeBlock(CodeBlockResult* result) {
+   callSiteTargets_.clear();
+   callFarJumps_.clear();
+ 
+-  // None of the linking or far-jump operations should emit masm metadata.
++  // None of the linking or far-jump operations should emit masm metadata,
++  // except on PPC64 where patchFarJump uses addLongJump to create CodeLabels
++  // for absolute-address far jumps. Drain those into linkData_ here.
++#ifdef JS_CODEGEN_PPC64
++  for (const jit::CodeLabel& codeLabel : masm_->codeLabels()) {
++    LinkData::InternalLink link;
++    link.patchAtOffset = codeLabel.patchAt().offset();
++    link.targetOffset = codeLabel.target().offset();
++#  ifdef JS_CODELABEL_LINKMODE
++    link.mode = codeLabel.linkMode();
++#  endif
++    if (!linkData_->internalLinks.append(link)) {
++      return false;
++    }
++  }
++  masm_->codeLabels().clear();
++#endif
+ 
+   MOZ_ASSERT(masm_->inliningContext().empty());
+   MOZ_ASSERT(masm_->callSites().empty());
+diff --git a/js/src/wasm/WasmIonCompile.cpp b/js/src/wasm/WasmIonCompile.cpp
+index 9c79b9cf0704..0d0e661770af 100644
+--- a/js/src/wasm/WasmIonCompile.cpp
++++ b/js/src/wasm/WasmIonCompile.cpp
+@@ -11602,7 +11602,7 @@ bool js::wasm::IonPlatformSupport() {
+ #if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) ||       \
+     defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) ||    \
+     defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_LOONG64) || \
+-    defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+   return true;
+ #else
+   return false;
+diff --git a/js/src/wasm/WasmMemory.cpp b/js/src/wasm/WasmMemory.cpp
+index 0e3e6d3509ad..feee9f6ea1c9 100644
+--- a/js/src/wasm/WasmMemory.cpp
++++ b/js/src/wasm/WasmMemory.cpp
+@@ -288,9 +288,9 @@ static_assert(MaxMemoryAccessSize <= HugeUnalignedGuardPage,
+ static_assert(HugeOffsetGuardLimit < UINT32_MAX,
+               "checking for overflow against OffsetGuardLimit is enough.");
+ 
+-// We have only tested huge memory on x64, arm64 and riscv64.
++// We have only tested huge memory on x64, arm64, riscv64 and ppc64.
+ #  if !(defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64) || \
+-        defined(JS_CODEGEN_RISCV64))
++        defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64))
+ #    error "Not an expected configuration"
+ #  endif
+ 
+diff --git a/js/src/wasm/WasmSignalHandlers.cpp b/js/src/wasm/WasmSignalHandlers.cpp
+index cc8bc2755745..84d3c4ec164d 100644
+--- a/js/src/wasm/WasmSignalHandlers.cpp
++++ b/js/src/wasm/WasmSignalHandlers.cpp
+@@ -111,7 +111,9 @@ using namespace js::wasm;
+ #    if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || \
+         defined(__PPC64LE__)
+ #      define R01_sig(p) ((p)->sc_frame.fixreg[1])
++#      define R31_sig(p) ((p)->sc_frame.fixreg[31])
+ #      define R32_sig(p) ((p)->sc_frame.srr0)
++#      define R36_sig(p) ((p)->sc_frame.lr)
+ #    endif
+ #  elif defined(__linux__) || defined(__sun)
+ #    if defined(__linux__)
+@@ -157,7 +159,9 @@ using namespace js::wasm;
+ #    if defined(__linux__) && (defined(__ppc64__) || defined(__PPC64__) || \
+                                defined(__ppc64le__) || defined(__PPC64LE__))
+ #      define R01_sig(p) ((p)->uc_mcontext.gp_regs[1])
++#      define R31_sig(p) ((p)->uc_mcontext.gp_regs[31])
+ #      define R32_sig(p) ((p)->uc_mcontext.gp_regs[32])
++#      define R36_sig(p) ((p)->uc_mcontext.gp_regs[36])
+ #    endif
+ #    if defined(__linux__) && defined(__loongarch__)
+ #      define EPC_sig(p) ((p)->uc_mcontext.__pc)
+@@ -200,7 +204,9 @@ using namespace js::wasm;
+ #    if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || \
+         defined(__PPC64LE__)
+ #      define R01_sig(p) ((p)->uc_mcontext.__gregs[_REG_R1])
++#      define R31_sig(p) ((p)->uc_mcontext.__gregs[_REG_R31])
+ #      define R32_sig(p) ((p)->uc_mcontext.__gregs[_REG_PC])
++#      define R36_sig(p) ((p)->uc_mcontext.__gregs[_REG_LR])
+ #    endif
+ #  elif defined(__DragonFly__) || defined(__FreeBSD__) || \
+       defined(__FreeBSD_kernel__)
+@@ -234,7 +240,9 @@ using namespace js::wasm;
+ #    if defined(__FreeBSD__) && (defined(__ppc64__) || defined(__PPC64__) || \
+                                  defined(__ppc64le__) || defined(__PPC64LE__))
+ #      define R01_sig(p) ((p)->uc_mcontext.mc_gpr[1])
++#      define R31_sig(p) ((p)->uc_mcontext.mc_gpr[31])
+ #      define R32_sig(p) ((p)->uc_mcontext.mc_srr0)
++#      define R36_sig(p) ((p)->uc_mcontext.mc_lr)
+ #    endif
+ #  elif defined(XP_DARWIN)
+ #    define EIP_sig(p) ((p)->thread.uts.ts32.__eip)
+@@ -412,7 +420,8 @@ struct macos_aarch64_context {
+       defined(__PPC64LE__)
+ #    define PC_sig(p) R32_sig(p)
+ #    define SP_sig(p) R01_sig(p)
+-#    define FP_sig(p) R01_sig(p)
++#    define FP_sig(p) R31_sig(p)
++#    define LR_sig(p) R36_sig(p)
+ #  elif defined(__loongarch__)
+ #    define PC_sig(p) EPC_sig(p)
+ #    define FP_sig(p) RFP_sig(p)
+@@ -458,7 +467,8 @@ static uint8_t* ContextToSP(CONTEXT* context) {
+ }
+ 
+ #  if defined(__arm__) || defined(__aarch64__) || defined(__mips__) || \
+-      defined(__loongarch__) || defined(__riscv)
++      defined(__loongarch__) || defined(__riscv) || \
++      defined(__ppc64__) || defined(__PPC64__)
+ static uint8_t* ContextToLR(CONTEXT* context) {
+ #    ifdef LR_sig
+   return mozilla::BitwiseCast<uint8_t*>(LR_sig(context));
+@@ -475,7 +485,8 @@ static JS::ProfilingFrameIterator::RegisterState ToRegisterState(
+   state.pc = ContextToPC(context);
+   state.sp = ContextToSP(context);
+ #  if defined(__arm__) || defined(__aarch64__) || defined(__mips__) || \
+-      defined(__loongarch__) || defined(__riscv)
++      defined(__loongarch__) || defined(__riscv) || \
++      defined(__ppc64__) || defined(__PPC64__)
+   state.lr = ContextToLR(context);
+ #  else
+   state.lr = (void*)UINTPTR_MAX;
+@@ -776,6 +787,9 @@ static void MachExceptionHandlerThread() {
+ 
+ #    if defined(__mips__) || defined(__loongarch__)
+ static const uint32_t kWasmTrapSignal = SIGFPE;
++#    elif defined(__ppc64__) || defined(__PPC64__) || \ 
++          defined(__ppc64le__) || defined(__PPC64LE__)
++static const uint32_t kWasmTrapSignal = SIGTRAP;
+ #    else
+ static const uint32_t kWasmTrapSignal = SIGILL;
+ #    endif
+diff --git a/js/src/wasm/WasmStacks.cpp b/js/src/wasm/WasmStacks.cpp
+index 71497353c5c1..6514d8b0e2e4 100644
+--- a/js/src/wasm/WasmStacks.cpp
++++ b/js/src/wasm/WasmStacks.cpp
+@@ -426,6 +426,30 @@ static constexpr size_t ContStackMaxJitStackSize = 10 * 1024 * 1024;
+ // or stack snapshots utilities.
+ static constexpr size_t ContStackRedZoneSize = 0x8000;
+ 
++// Effective red-zone size used when laying out a continuation stack.
++//
++// The jit stack (and therefore the bottom guard page) must start on a page
++// boundary; otherwise gc::ProtectPages trips MOZ_RELEASE_ASSERT(length %
++// pageSize == 0). The red zone sits between the top guard page and the jit
++// stack, so its size has to be a page multiple to keep that start aligned.
++//
++// Rounding the red zone up to a page is correct on every platform and would
++// also cover any configuration whose page size exceeds ContStackRedZoneSize
++// (32K) -- e.g. a 64K-page AArch64 kernel -- but ContStackRedZoneSize is
++// already a multiple of the 4K/16K pages used on the tier-1 platforms, so the
++// round-up is a no-op there today. We deliberately gate it to PPC64 (64K
++// pages, where the round-up is load-bearing) so this patch cannot alter
++// continuation stack layout on any tier-1 platform. Drop the gate if the
++// general case is ever wanted.
++static inline size_t ContStackEffectiveRedZoneSize(
++    [[maybe_unused]] size_t pageSize) {
++#ifdef JS_CODEGEN_PPC64
++  return RoundUp(ContStackRedZoneSize, pageSize);
++#else
++  return ContStackRedZoneSize;
++#endif
++}
++
+ // Number of guard pages at the top and bottom of each continuation stack slot.
+ static constexpr size_t ContStackTopGuardPages = 1;
+ static constexpr size_t ContStackBottomGuardPages = 1;
+@@ -444,8 +468,8 @@ void ContStackSize::compute() {
+                          ContStackMinJitStackSize, ContStackMaxJitStackSize),
+               pageSize);
+   headerSize = RoundUp(sizeof(ContStack), pageSize);
+-  totalSize = topGuardSize + ContStackRedZoneSize + jitStackSize +
+-              bottomGuardSize + headerSize;
++  totalSize = topGuardSize + ContStackEffectiveRedZoneSize(pageSize) +
++              jitStackSize + bottomGuardSize + headerSize;
+ 
+   // Assert we can't overflow when multiplying our size by capacity. Assume
+   // 32-bit integers to be conservative.
+@@ -467,7 +491,8 @@ void ContStack::init(ContStackArena* arena, uintptr_t allocationBase,
+   uintptr_t topGuardPagePhysicalStart = allocationBase;
+   uintptr_t topGuardPagePhysicalEnd = allocationBase + topGuardPageSize;
+   uintptr_t redZonePhysicalStart = topGuardPagePhysicalEnd;
+-  uintptr_t jitStackPhysicalStart = redZonePhysicalStart + ContStackRedZoneSize;
++  uintptr_t jitStackPhysicalStart =
++      redZonePhysicalStart + ContStackEffectiveRedZoneSize(pageSize);
+   uintptr_t jitStackPhysicalEnd = jitStackPhysicalStart + jitStackSize;
+   uintptr_t bottomGuardPagePhysicalStart = jitStackPhysicalEnd;
+   uintptr_t headerPhysicalStart =
+diff --git a/js/src/wasm/WasmStubs.cpp b/js/src/wasm/WasmStubs.cpp
+index 8a98e201a452..8497814fcd37 100644
+--- a/js/src/wasm/WasmStubs.cpp
++++ b/js/src/wasm/WasmStubs.cpp
+@@ -646,8 +646,9 @@ static bool GenerateInterpEntry(MacroAssembler& masm, const FuncExport& fe,
+ 
+   // Save the return address if it wasn't already saved by the call insn.
+ #ifdef JS_USE_LINK_REGISTER
+-#  if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || \
+-      defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#  if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) ||      \
++      defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
++      defined(JS_CODEGEN_PPC64)
+   masm.pushReturnAddress();
+ #  elif defined(JS_CODEGEN_ARM64)
+   // WasmPush updates framePushed() unlike pushReturnAddress(), but that's
+@@ -2123,9 +2124,10 @@ static bool GenerateImportInterpExit(MacroAssembler& masm, const FuncImport& fi,
+   // The native ABI preserves the instance, heap and global registers since they
+   // are non-volatile.
+   MOZ_ASSERT(NonVolatileRegs.has(InstanceReg));
+-#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) ||      \
+-    defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
+-    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) ||         \
++    defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) ||    \
++    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
++    defined(JS_CODEGEN_PPC64)
+   MOZ_ASSERT(NonVolatileRegs.has(HeapReg));
+ #endif
+ 
+@@ -2571,6 +2573,15 @@ bool wasm::GenerateBuiltinThunk(MacroAssembler& masm, ABIFunctionType abiType,
+                         Register::FromCode(regId + 1));
+         }
+       }
++#endif
++#ifdef JS_CODEGEN_PPC64
++      // PPC64 32-bit operations do not zero-extend to 64 bits (unlike
++      // x86-64/ARM64/LA64). The ELFv2 ABI requires callers to zero/sign-extend
++      // narrow args. Wasm i32 values may have garbage upper bits in 64-bit
++      // registers, so zero-extend them before calling C++ builtins.
++      if (selfArgs.mirType() == MIRType::Int32) {
++        masm.move32ZeroExtendToPtr(selfArgs->gpr(), selfArgs->gpr());
++      }
+ #endif
+       continue;
+     }
+@@ -2659,6 +2670,28 @@ static const LiveRegisterSet RegsToPreserve(
+ #  ifdef ENABLE_WASM_SIMD
+ #    error "high lanes of SIMD registers need to be saved too."
+ #  endif
++#elif defined(JS_CODEGEN_PPC64)
++// Exclude r0 (ScratchRegister, not allocatable, special addressing semantics),
++// r1 (SP), r2 (TOC pointer, reserved), and r13 (TLS pointer, reserved).
++static const LiveRegisterSet RegsToPreserve(
++    GeneralRegisterSet(Registers::AllMask & ~((uint32_t(1) << Registers::r0) |
++                                              (uint32_t(1) << Registers::r1) |
++                                              (uint32_t(1) << Registers::r2) |
++                                              (uint32_t(1) << Registers::r13))),
++#  ifdef ENABLE_WASM_SIMD
++    // Unlike ARM64, where the vector registers alias the doubles, PPC64
++    // doubles live in the FPRs (VSR0-31) while wasm v128 values live in the
++    // VRs (VSR32-63) -- two disjoint physical pools, so both must be
++    // preserved. Saving only the doubles loses the entire live v128 state: a
++    // trap firing while a v128 is live (notably the interrupt-check trap,
++    // which fires constantly in hot loops) resumes with whatever the C++
++    // handler's libc left in the VRs (e.g. glibc's vector memcpy leaves lvsl
++    // alignment-control patterns in low VRs).
++    FloatRegisterSet(FloatRegisters::AllDoubleMask |
++                     FloatRegisters::AllSimd128Mask));
++#  else
++    FloatRegisterSet(FloatRegisters::AllDoubleMask));
++#  endif
+ #elif defined(JS_CODEGEN_ARM64)
+ // We assume that traps do not happen while lr is live. This both ensures that
+ // the size of RegsToPreserve is a multiple of 2 (preserving WasmStackAlignment)
+diff --git a/js/src/wasm/WasmSummarizeInsn.cpp b/js/src/wasm/WasmSummarizeInsn.cpp
+index 7bb4f4b7a725..2ae55a1b1b9e 100644
+--- a/js/src/wasm/WasmSummarizeInsn.cpp
++++ b/js/src/wasm/WasmSummarizeInsn.cpp
+@@ -1731,6 +1731,169 @@ Maybe<TrapMachineInsn> SummarizeTrapInstruction(const uint8_t* insnAddr) {
+   return Nothing();
+ }
+ 
++// ================================================================== ppc64 ====
++
++#  elif defined(JS_CODEGEN_PPC64)
++
++Maybe<TrapMachineInsn> SummarizeTrapInstruction(const uint8_t* insnAddr) {
++  MOZ_ASSERT(0 == (uintptr_t(insnAddr) & 3));
++
++  const uint32_t insn = *(uint32_t*)insnAddr;
++  const uint32_t majorOp = insn >> 26;
++  // X-form secondary opcode: bits 10..1.
++  const uint32_t xo = (insn >> 1) & 0x3FF;
++
++  // PPC_trap = 0x7FE00008 = tw 31,0,0.
++  if (insn == 0x7FE00008) {
++    return Some(TrapMachineInsn::OfficialUD);
++  }
++
++  // D-form / DS-form loads.
++  switch (majorOp) {
++    case 34:  // lbz
++      return Some(TrapMachineInsn::Load8);
++    case 40:  // lhz
++    case 42:  // lha
++      return Some(TrapMachineInsn::Load16);
++    case 32:  // lwz
++      return Some(TrapMachineInsn::Load32);
++    case 58:  // ld (DS=0) / lwa (DS=2)
++      if ((insn & 3) == 2) {
++        return Some(TrapMachineInsn::Load32);  // lwa
++      }
++      return Some(TrapMachineInsn::Load64);  // ld
++    case 48:                                 // lfs
++      return Some(TrapMachineInsn::Load32);
++    case 50:  // lfd
++      return Some(TrapMachineInsn::Load64);
++    default:
++      break;
++  }
++
++  // D-form / DS-form stores.
++  switch (majorOp) {
++    case 38:  // stb
++      return Some(TrapMachineInsn::Store8);
++    case 44:  // sth
++      return Some(TrapMachineInsn::Store16);
++    case 36:  // stw
++    case 37:  // stwu
++      return Some(TrapMachineInsn::Store32);
++    case 52:  // stfs
++      return Some(TrapMachineInsn::Store32);
++    case 62:  // std (DS=0) / stdu (DS=1)
++      return Some(TrapMachineInsn::Store64);
++    case 54:  // stfd
++    case 55:  // stfdu
++      return Some(TrapMachineInsn::Store64);
++    default:
++      break;
++  }
++
++  // X-form instructions (major opcode 31).
++  if (majorOp == 31) {
++    switch (xo) {
++      // Indexed loads.
++      case 87:  // lbzx
++        return Some(TrapMachineInsn::Load8);
++      case 279:  // lhzx
++      case 343:  // lhax
++        return Some(TrapMachineInsn::Load16);
++      case 23:  // lwzx
++        return Some(TrapMachineInsn::Load32);
++      case 21:  // ldx
++        return Some(TrapMachineInsn::Load64);
++      case 535:  // lfsx
++      case 855:  // lfiwax
++      case 887:  // lfiwzx
++        return Some(TrapMachineInsn::Load32);
++      case 599:  // lfdx
++        return Some(TrapMachineInsn::Load64);
++      case 790:  // lhbrx (byte-reverse halfword)
++        return Some(TrapMachineInsn::Load16);
++      case 534:  // lwbrx (byte-reverse word)
++        return Some(TrapMachineInsn::Load32);
++
++      // Indexed stores.
++      case 215:  // stbx
++        return Some(TrapMachineInsn::Store8);
++      case 407:  // sthx
++        return Some(TrapMachineInsn::Store16);
++      case 151:  // stwx
++        return Some(TrapMachineInsn::Store32);
++      case 149:  // stdx
++        return Some(TrapMachineInsn::Store64);
++      case 663:  // stfsx
++        return Some(TrapMachineInsn::Store32);
++      case 727:  // stfdx
++        return Some(TrapMachineInsn::Store64);
++      case 918:  // sthbrx (byte-reverse halfword store)
++        return Some(TrapMachineInsn::Store16);
++      case 662:  // stwbrx (byte-reverse word store)
++        return Some(TrapMachineInsn::Store32);
++
++      // VSX SIMD indexed load/store (XX1-form, same major opcode 31).
++      case 268:  // lxvx (POWER9)
++      case 844:  // lxvd2x (POWER8)
++        return Some(TrapMachineInsn::Load128);
++      case 396:  // stxvx (POWER9)
++      case 972:  // stxvd2x (POWER8)
++        return Some(TrapMachineInsn::Store128);
++
++      // Atomic (load-reserve / store-conditional).
++      case 20:   // lwarx
++      case 52:   // lbarx (POWER7+)
++      case 84:   // ldarx
++      case 116:  // lharx (POWER7+)
++        return Some(TrapMachineInsn::Atomic);
++      default:
++        break;
++    }
++    // stwcx. (XO=150, Rc=1), stdcx. (XO=214, Rc=1), stbcx. (XO=694, Rc=1)
++    // and sthcx. (XO=726, Rc=1) have bit 0 set. Note xo above already
++    // discards bit 0, so we need a separate low-11-bit match.
++    const uint32_t xoRc = insn & 0x7FF;  // bits 10..0
++    if (xoRc == ((150 << 1) | 1) || xoRc == ((214 << 1) | 1) ||
++        xoRc == ((694 << 1) | 1) || xoRc == ((726 << 1) | 1)) {
++      return Some(TrapMachineInsn::Atomic);
++    }
++  }
++
++  // POWER10 prefixed loads/stores (major opcode 1). The trap-site PC
++  // points at the prefix word; the actual load/store kind is encoded in
++  // the suffix word at insnAddr + 4. The 64-byte-boundary rule
++  // (ensurePrefixedAlignment) guarantees the suffix is in the same block.
++  if (majorOp == 1) {
++    const uint32_t suffix = *(uint32_t*)(insnAddr + 4);
++    const uint32_t suffixOp6 = suffix >> 26;          // 6-bit suffix op
++    const uint32_t suffixOp5 = suffix >> 27;          // 5-bit suffix op (plxv/pstxv)
++    switch (suffixOp6) {
++      case 57:  // pld
++        return Some(TrapMachineInsn::Load64);
++      case 50:  // plfd
++        return Some(TrapMachineInsn::Load64);
++      case 48:  // plfs
++        return Some(TrapMachineInsn::Load32);
++      case 61:  // pstd
++        return Some(TrapMachineInsn::Store64);
++      case 54:  // pstfd
++        return Some(TrapMachineInsn::Store64);
++      case 52:  // pstfs
++        return Some(TrapMachineInsn::Store32);
++      default:
++        break;
++    }
++    if (suffixOp5 == 25) {  // plxv
++      return Some(TrapMachineInsn::Load128);
++    }
++    if (suffixOp5 == 27) {  // pstxv
++      return Some(TrapMachineInsn::Store128);
++    }
++  }
++
++  return Nothing();
++}
++
+ // ================================================================== none ====
+ 
+ #  elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/wasm/WasmValue.cpp b/js/src/wasm/WasmValue.cpp
+index fda0996851e1..45fff24fa582 100644
+--- a/js/src/wasm/WasmValue.cpp
++++ b/js/src/wasm/WasmValue.cpp
+@@ -430,7 +430,7 @@ bool ToWebAssemblyValue_i32(JSContext* cx, HandleValue val, int32_t* loc,
+   bool ok = ToInt32(cx, val, loc);
+   if (ok && mustWrite64) {
+ #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+-    defined(JS_CODEGEN_RISCV64)
++    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+     loc[1] = loc[0] >> 31;
+ #else
+     loc[1] = 0;
+diff --git a/mfbt/Assertions.h b/mfbt/Assertions.h
+index a436d019a197..4887af7e7676 100644
+--- a/mfbt/Assertions.h
++++ b/mfbt/Assertions.h
+@@ -282,6 +282,11 @@ static inline void MOZ_CrashSequence(void* aAddress, intptr_t aLine) {
+       "st.d %1,%0,0;\n"  // Write the line number to the crashing address
+       :                  // no output registers
+       : "r"(aAddress), "r"(aLine));
++#  elif defined(__powerpc64__)
++  asm volatile(
++      "std %1,0(%0);\n"  // Write the line number to the crashing address
++      :                  // no output registers
++      : "r"(aAddress), "r"(aLine));
+ #  else
+ #    warning \
+         "Unsupported architecture, replace the code below with assembly suitable to crash the process"
+-- 
+2.52.0
+

diff --git a/firefox.spec b/firefox.spec
index 06a6900..be8abaf 100644
--- a/firefox.spec
+++ b/firefox.spec
@@ -281,6 +281,11 @@ Patch600:        pgo.patch
 Patch602:        mozilla-1516803.patch
 Patch603:        firefox-gcc-always-inline.patch
 
+# ppc64le JIT
+Patch900:        0001-Add-VSX-instructions-for-SKIA.patch
+Patch901:        0002-Add-VSX-instructions-for-libwebp.patch
+Patch902:        0003-Add-PPC64LE-JIT-backend.patch
+
 
 %if %{?system_nss}
 BuildRequires:  pkgconfig(nspr) >= %{nspr_version}
@@ -601,6 +606,11 @@ cat %{SOURCE49} | sed -e "s|LIBCLANG_RT_PLACEHOLDER|`pwd`/wasi-sdk-30/build/sysr
 %endif
 %patch -P603 -p1 -b .inline
 
+# ppc64le JIT
+%patch -P900 -p1
+%patch -P901 -p1
+%patch -P902 -p1
+
 rm -f .mozconfig
 cp %{SOURCE10} .mozconfig
 echo "ac_add_options --enable-default-toolkit=cairo-gtk3-wayland" >> .mozconfig

                 reply	other threads:[~2026-06-16 13:10 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=178161545850.1.13214606391308119450.rpms-firefox-e99f0d4925ac@fedoraproject.org \
    --to=git-commits@fedoraproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox