From 848037cadd63091d5c39d06cd5d49aeee2258b4e Mon Sep 17 00:00:00 2001
From: Kevin Newton <kddnewton@gmail.com>
Date: Fri, 9 Sep 2022 11:37:41 -0400
Subject: [PATCH] Better offsets (#6315)

* Introduce InstructionOffset for AArch64

There are a lot of instructions on AArch64 where we take an offset
from PC in terms of the number of instructions. This is for loading
a value relative to the PC or for jumping.

We were usually accepting an A64Opnd or an i32. It can get
confusing and inconsistent though because sometimes you would
divide by 4 to get the number of instructions or multiply by 4 to
get the number of bytes.

This commit adds a struct that wraps an i32 in order to keep all of
that logic in one place. It makes it much easier to read and reason
about how these offsets are getting used.

* Use b instruction when the offset fits on AArch64
---
 yjit/src/asm/arm64/arg/inst_offset.rs   | 47 ++++++++++++++++++
 yjit/src/asm/arm64/arg/mod.rs           |  2 +
 yjit/src/asm/arm64/inst/branch_cond.rs  | 18 +++----
 yjit/src/asm/arm64/inst/call.rs         | 28 +++++------
 yjit/src/asm/arm64/inst/load_literal.rs | 14 +++---
 yjit/src/asm/arm64/mod.rs               | 63 ++++++++++---------------
 yjit/src/backend/arm64/mod.rs           | 52 ++++++++++----------
 7 files changed, 133 insertions(+), 91 deletions(-)
 create mode 100644 yjit/src/asm/arm64/arg/inst_offset.rs
diff --git a/yjit/src/asm/arm64/arg/inst_offset.rs b/yjit/src/asm/arm64/arg/inst_offset.rs
new file mode 100644
index 0000000000..f4a6bc73a0
--- /dev/null
+++ b/yjit/src/asm/arm64/arg/inst_offset.rs
@@ -0,0 +1,47 @@
+/// There are a lot of instructions in the AArch64 architectrue that take an
+/// offset in terms of number of instructions. Usually they are jump
+/// instructions or instructions that load a value relative to the current PC.
+///
+/// This struct is used to mark those locations instead of a generic operand in
+/// order to give better clarity to the developer when reading the AArch64
+/// backend code. It also helps to clarify that everything is in terms of a
+/// number of instructions and not a number of bytes (i.e., the offset is the
+/// number of bytes divided by 4).
+#[derive(Copy, Clone)]
+pub struct InstructionOffset(i32);
+
+impl InstructionOffset {
+    /// Create a new instruction offset.
+    pub fn from_insns(insns: i32) -> Self {
+        InstructionOffset(insns)
+    }
+
+    /// Create a new instruction offset from a number of bytes.
+    pub fn from_bytes(bytes: i32) -> Self {
+        assert_eq!(bytes % 4, 0, "Byte offset must be a multiple of 4");
+        InstructionOffset(bytes / 4)
+    }
+}
+
+impl From<i32> for InstructionOffset {
+    /// Convert an i64 into an instruction offset.
+    fn from(value: i32) -> Self {
+        InstructionOffset(value)
+    }
+}
+
+impl From<InstructionOffset> for i32 {
+    /// Convert an instruction offset into a number of instructions as an i32.
+    fn from(offset: InstructionOffset) -> Self {
+        offset.0
+    }
+}
+
+impl From<InstructionOffset> for i64 {
+    /// Convert an instruction offset into a number of instructions as an i64.
+    /// This is useful for when we're checking how many bits this offset fits
+    /// into.
+    fn from(offset: InstructionOffset) -> Self {
+        offset.0.into()
+    }
+}
diff --git a/yjit/src/asm/arm64/arg/mod.rs b/yjit/src/asm/arm64/arg/mod.rs
index 9bf4a8ea13..7eb37834f9 100644
--- a/yjit/src/asm/arm64/arg/mod.rs
+++ b/yjit/src/asm/arm64/arg/mod.rs
@@ -3,6 +3,7 @@
 
 mod bitmask_imm;
 mod condition;
+mod inst_offset;
 mod sf;
 mod shifted_imm;
 mod sys_reg;
@@ -10,6 +11,7 @@ mod truncate;
 
 pub use bitmask_imm::BitmaskImmediate;
 pub use condition::Condition;
+pub use inst_offset::InstructionOffset;
 pub use sf::Sf;
 pub use shifted_imm::ShiftedImmediate;
 pub use sys_reg::SystemRegister;
diff --git a/yjit/src/asm/arm64/inst/branch_cond.rs b/yjit/src/asm/arm64/inst/branch_cond.rs
index c489bacef0..4338cf0f4f 100644
--- a/yjit/src/asm/arm64/inst/branch_cond.rs
+++ b/yjit/src/asm/arm64/inst/branch_cond.rs
@@ -1,4 +1,4 @@
-use super::super::arg::{Condition, truncate_imm};
+use super::super::arg::{Condition, InstructionOffset, truncate_imm};
 
 /// The struct that represents an A64 conditional branch instruction that can be
 /// encoded.
@@ -14,14 +14,14 @@ pub struct BranchCond {
     cond: u8,
 
     /// The instruction offset from this instruction to branch to.
-    imm19: i32
+    offset: InstructionOffset
 }
 
 impl BranchCond {
     /// B.cond
     /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/B-cond--Branch-conditionally-
-    pub fn bcond(cond: u8, imm19: i32) -> Self {
-        Self { cond, imm19 }
+    pub fn bcond(cond: u8, offset: InstructionOffset) -> Self {
+        Self { cond, offset }
     }
 }
 
@@ -34,7 +34,7 @@ impl From<BranchCond> for u32 {
         0
         | (1 << 30)
         | (FAMILY << 26)
-        | (truncate_imm::<_, 19>(inst.imm19) << 5)
+        | (truncate_imm::<_, 19>(inst.offset) << 5)
         | (inst.cond as u32)
     }
 }
@@ -53,25 +53,25 @@ mod tests {
 
     #[test]
     fn test_b_eq() {
-        let result: u32 = BranchCond::bcond(Condition::EQ, 32).into();
+        let result: u32 = BranchCond::bcond(Condition::EQ, 32.into()).into();
         assert_eq!(0x54000400, result);
     }
 
     #[test]
     fn test_b_vs() {
-        let result: u32 = BranchCond::bcond(Condition::VS, 32).into();
+        let result: u32 = BranchCond::bcond(Condition::VS, 32.into()).into();
         assert_eq!(0x54000406, result);
     }
 
     #[test]
     fn test_b_eq_max() {
-        let result: u32 = BranchCond::bcond(Condition::EQ, (1 << 18) - 1).into();
+        let result: u32 = BranchCond::bcond(Condition::EQ, ((1 << 18) - 1).into()).into();
         assert_eq!(0x547fffe0, result);
     }
 
     #[test]
     fn test_b_eq_min() {
-        let result: u32 = BranchCond::bcond(Condition::EQ, -(1 << 18)).into();
+        let result: u32 = BranchCond::bcond(Condition::EQ, (-(1 << 18)).into()).into();
         assert_eq!(0x54800000, result);
     }
 }
diff --git a/yjit/src/asm/arm64/inst/call.rs b/yjit/src/asm/arm64/inst/call.rs
index 32d924f799..74debac7f7 100644
--- a/yjit/src/asm/arm64/inst/call.rs
+++ b/yjit/src/asm/arm64/inst/call.rs
@@ -1,4 +1,4 @@
-use super::super::arg::truncate_imm;
+use super::super::arg::{InstructionOffset, truncate_imm};
 
 /// The operation to perform for this instruction.
 enum Op {
@@ -20,8 +20,8 @@ enum Op {
 /// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
 ///
 pub struct Call {
-    /// The PC-relative offset to jump to (which will be multiplied by 4).
-    imm26: i32,
+    /// The PC-relative offset to jump to in terms of number of instructions.
+    offset: InstructionOffset,
 
     /// The operation to perform for this instruction.
     op: Op
@@ -30,14 +30,14 @@ pub struct Call {
 impl Call {
     /// B
     /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B--Branch-
-    pub fn b(imm26: i32) -> Self {
-        Self { imm26, op: Op::Branch }
+    pub fn b(offset: InstructionOffset) -> Self {
+        Self { offset, op: Op::Branch }
     }
 
     /// BL
     /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link-?lang=en
-    pub fn bl(imm26: i32) -> Self {
-        Self { imm26, op: Op::BranchWithLink }
+    pub fn bl(offset: InstructionOffset) -> Self {
+        Self { offset, op: Op::BranchWithLink }
     }
 }
 
@@ -50,7 +50,7 @@ impl From<Call> for u32 {
         0
         | ((inst.op as u32) << 31)
         | (FAMILY << 26)
-        | truncate_imm::<_, 26>(inst.imm26)
+        | truncate_imm::<_, 26>(inst.offset)
     }
 }
 
@@ -68,37 +68,37 @@ mod tests {
 
     #[test]
     fn test_bl() {
-        let result: u32 = Call::bl(0).into();
+        let result: u32 = Call::bl(0.into()).into();
         assert_eq!(0x94000000, result);
     }
 
     #[test]
     fn test_bl_positive() {
-        let result: u32 = Call::bl(256).into();
+        let result: u32 = Call::bl(256.into()).into();
         assert_eq!(0x94000100, result);
     }
 
     #[test]
     fn test_bl_negative() {
-        let result: u32 = Call::bl(-256).into();
+        let result: u32 = Call::bl((-256).into()).into();
         assert_eq!(0x97ffff00, result);
     }
 
     #[test]
     fn test_b() {
-        let result: u32 = Call::b(0).into();
+        let result: u32 = Call::b(0.into()).into();
         assert_eq!(0x14000000, result);
     }
 
     #[test]
     fn test_b_positive() {
-        let result: u32 = Call::b((1 << 25) - 1).into();
+        let result: u32 = Call::b(((1 << 25) - 1).into()).into();
         assert_eq!(0x15ffffff, result);
     }
 
     #[test]
     fn test_b_negative() {
-        let result: u32 = Call::b(-(1 << 25)).into();
+        let result: u32 = Call::b((-(1 << 25)).into()).into();
         assert_eq!(0x16000000, result);
     }
 }
diff --git a/yjit/src/asm/arm64/inst/load_literal.rs b/yjit/src/asm/arm64/inst/load_literal.rs
index c5ab09713c..3eade205c8 100644
--- a/yjit/src/asm/arm64/inst/load_literal.rs
+++ b/yjit/src/asm/arm64/inst/load_literal.rs
@@ -1,4 +1,4 @@
-use super::super::arg::truncate_imm;
+use super::super::arg::{InstructionOffset, truncate_imm};
 
 /// The size of the operands being operated on.
 enum Opc {
@@ -32,7 +32,7 @@ pub struct LoadLiteral {
     rt: u8,
 
     /// The PC-relative number of instructions to load the value from.
-    imm19: i32,
+    offset: InstructionOffset,
 
     /// The size of the operands being operated on.
     opc: Opc
@@ -41,8 +41,8 @@ pub struct LoadLiteral {
 impl LoadLiteral {
     /// LDR (load literal)
     /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--literal---Load-Register--literal--?lang=en
-    pub fn ldr_literal(rt: u8, imm19: i32, num_bits: u8) -> Self {
-        Self { rt, imm19, opc: num_bits.into() }
+    pub fn ldr_literal(rt: u8, offset: InstructionOffset, num_bits: u8) -> Self {
+        Self { rt, offset, opc: num_bits.into() }
     }
 }
 
@@ -56,7 +56,7 @@ impl From<LoadLiteral> for u32 {
         | ((inst.opc as u32) << 30)
         | (1 << 28)
         | (FAMILY << 25)
-        | (truncate_imm::<_, 19>(inst.imm19) << 5)
+        | (truncate_imm::<_, 19>(inst.offset) << 5)
         | (inst.rt as u32)
     }
 }
@@ -75,14 +75,14 @@ mod tests {
 
     #[test]
     fn test_ldr_positive() {
-        let inst = LoadLiteral::ldr_literal(0, 5, 64);
+        let inst = LoadLiteral::ldr_literal(0, 5.into(), 64);
         let result: u32 = inst.into();
         assert_eq!(0x580000a0, result);
     }
 
     #[test]
     fn test_ldr_negative() {
-        let inst = LoadLiteral::ldr_literal(0, -5, 64);
+        let inst = LoadLiteral::ldr_literal(0, (-5).into(), 64);
         let result: u32 = inst.into();
         assert_eq!(0x58ffff60, result);
     }
diff --git a/yjit/src/asm/arm64/mod.rs b/yjit/src/asm/arm64/mod.rs
index b73b3125e2..420151c6d1 100644
--- a/yjit/src/asm/arm64/mod.rs
+++ b/yjit/src/asm/arm64/mod.rs
@@ -190,15 +190,9 @@ pub const fn b_offset_fits_bits(offset: i64) -> bool {
 }
 
 /// B - branch without link (offset is number of instructions to jump)
-pub fn b(cb: &mut CodeBlock, imm26: A64Opnd) {
-    let bytes: [u8; 4] = match imm26 {
-        A64Opnd::Imm(imm26) => {
-            assert!(b_offset_fits_bits(imm26), "The immediate operand must be 26 bits or less.");
-
-            Call::b(imm26 as i32).into()
-        },
-        _ => panic!("Invalid operand combination to b instruction.")
-    };
+pub fn b(cb: &mut CodeBlock, offset: InstructionOffset) {
+    assert!(b_offset_fits_bits(offset.into()), "The immediate operand must be 26 bits or less.");
+    let bytes: [u8; 4] = Call::b(offset).into();
 
     cb.write_bytes(&bytes);
 }
@@ -208,33 +202,21 @@ pub fn b(cb: &mut CodeBlock, imm26: A64Opnd) {
 /// value into a register first, then use the b.cond instruction to skip past a
 /// direct jump.
 pub const fn bcond_offset_fits_bits(offset: i64) -> bool {
-    imm_fits_bits(offset, 21) && (offset & 0b11 == 0)
+    imm_fits_bits(offset, 19)
 }
 
 /// B.cond - branch to target if condition is true
-pub fn bcond(cb: &mut CodeBlock, cond: u8, byte_offset: A64Opnd) {
-    let bytes: [u8; 4] = match byte_offset {
-        A64Opnd::Imm(imm) => {
-            assert!(bcond_offset_fits_bits(imm), "The immediate operand must be 21 bits or less and be aligned to a 2-bit boundary.");
-
-            BranchCond::bcond(cond, (imm / 4) as i32).into()
-        },
-        _ => panic!("Invalid operand combination to bcond instruction."),
-    };
+pub fn bcond(cb: &mut CodeBlock, cond: u8, offset: InstructionOffset) {
+    assert!(bcond_offset_fits_bits(offset.into()), "The offset must be 19 bits or less.");
+    let bytes: [u8; 4] = BranchCond::bcond(cond, offset).into();
 
     cb.write_bytes(&bytes);
 }
 
 /// BL - branch with link (offset is number of instructions to jump)
-pub fn bl(cb: &mut CodeBlock, imm26: A64Opnd) {
-    let bytes: [u8; 4] = match imm26 {
-        A64Opnd::Imm(imm26) => {
-            assert!(b_offset_fits_bits(imm26), "The immediate operand must be 26 bits or less.");
-
-            Call::bl(imm26 as i32).into()
-        },
-        _ => panic!("Invalid operand combination to bl instruction.")
-    };
+pub fn bl(cb: &mut CodeBlock, offset: InstructionOffset) {
+    assert!(b_offset_fits_bits(offset.into()), "The offset must be 26 bits or less.");
+    let bytes: [u8; 4] = Call::bl(offset).into();
 
     cb.write_bytes(&bytes);
 }
@@ -413,7 +395,7 @@ pub fn ldr(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
 }
 
 /// LDR - load a PC-relative memory address into a register
-pub fn ldr_literal(cb: &mut CodeBlock, rt: A64Opnd, rn: i32) {
+pub fn ldr_literal(cb: &mut CodeBlock, rt: A64Opnd, rn: InstructionOffset) {
     let bytes: [u8; 4] = match rt {
         A64Opnd::Reg(rt) => {
             LoadLiteral::ldr_literal(rt.reg_no, rn, rt.num_bits).into()
@@ -1087,45 +1069,52 @@ mod tests {
 
     #[test]
     fn test_bcond() {
-        check_bytes("01200054", |cb| bcond(cb, Condition::NE, A64Opnd::new_imm(0x400)));
+        let offset = InstructionOffset::from_insns(0x100);
+        check_bytes("01200054", |cb| bcond(cb, Condition::NE, offset));
     }
 
     #[test]
     fn test_b() {
-        check_bytes("ffffff15", |cb| b(cb, A64Opnd::new_imm((1 << 25) - 1)));
+        let offset = InstructionOffset::from_insns((1 << 25) - 1);
+        check_bytes("ffffff15", |cb| b(cb, offset));
     }
 
     #[test]
     #[should_panic]
     fn test_b_too_big() {
         // There are 26 bits available
-        check_bytes("", |cb| b(cb, A64Opnd::new_imm(1 << 25)));
+        let offset = InstructionOffset::from_insns(1 << 25);
+        check_bytes("", |cb| b(cb, offset));
     }
 
     #[test]
     #[should_panic]
     fn test_b_too_small() {
         // There are 26 bits available
-        check_bytes("", |cb| b(cb, A64Opnd::new_imm(-(1 << 25) - 1)));
+        let offset = InstructionOffset::from_insns(-(1 << 25) - 1);
+        check_bytes("", |cb| b(cb, offset));
     }
 
     #[test]
     fn test_bl() {
-        check_bytes("00000096", |cb| bl(cb, A64Opnd::new_imm(-(1 << 25))));
+        let offset = InstructionOffset::from_insns(-(1 << 25));
+        check_bytes("00000096", |cb| bl(cb, offset));
     }
 
     #[test]
     #[should_panic]
     fn test_bl_too_big() {
         // There are 26 bits available
-        check_bytes("", |cb| bl(cb, A64Opnd::new_imm(1 << 25)));
+        let offset = InstructionOffset::from_insns(1 << 25);
+        check_bytes("", |cb| bl(cb, offset));
     }
 
     #[test]
     #[should_panic]
     fn test_bl_too_small() {
         // There are 26 bits available
-        check_bytes("", |cb| bl(cb, A64Opnd::new_imm(-(1 << 25) - 1)));
+        let offset = InstructionOffset::from_insns(-(1 << 25) - 1);
+        check_bytes("", |cb| bl(cb, offset));
     }
 
     #[test]
@@ -1200,7 +1189,7 @@ mod tests {
 
     #[test]
     fn test_ldr_literal() {
-        check_bytes("40010058", |cb| ldr_literal(cb, X0, 10));
+        check_bytes("40010058", |cb| ldr_literal(cb, X0, 10.into()));
     }
 
     #[test]
diff --git a/yjit/src/backend/arm64/mod.rs b/yjit/src/backend/arm64/mod.rs
index 32db0ab3dc..446332788a 100644
--- a/yjit/src/backend/arm64/mod.rs
+++ b/yjit/src/backend/arm64/mod.rs
@@ -589,13 +589,15 @@ impl Assembler
                 Target::CodePtr(dst_ptr) => {
                     let dst_addr = dst_ptr.into_i64();
                     let src_addr = cb.get_write_ptr().into_i64();
-                    let offset = dst_addr - src_addr;
 
-                    let num_insns = if bcond_offset_fits_bits(offset) {
+                    let num_insns = if bcond_offset_fits_bits((dst_addr - src_addr) / 4) {
                         // If the jump offset fits into the conditional jump as
                         // an immediate value and it's properly aligned, then we
-                        // can use the b.cond instruction directly.
-                        bcond(cb, CONDITION, A64Opnd::new_imm(offset));
+                        // can use the b.cond instruction directly. We're safe
+                        // to use as i32 here since we already checked that it
+                        // fits.
+                        let bytes = (dst_addr - src_addr) as i32;
+                        bcond(cb, CONDITION, InstructionOffset::from_bytes(bytes));
 
                         // Here we're going to return 1 because we've only
                         // written out 1 instruction.
@@ -604,12 +606,12 @@ impl Assembler
                         // Otherwise, we need to load the address into a
                         // register and use the branch register instruction.
                         let dst_addr = dst_ptr.into_u64();
-                        let load_insns: i64 = emit_load_size(dst_addr).into();
+                        let load_insns: i32 = emit_load_size(dst_addr).into();
 
                         // We're going to write out the inverse condition so
                         // that if it doesn't match it will skip over the
                         // instructions used for branching.
-                        bcond(cb, Condition::inverse(CONDITION), A64Opnd::new_imm((load_insns + 2) * 4));
+                        bcond(cb, Condition::inverse(CONDITION), (load_insns + 2).into());
                         emit_load_value(cb, Assembler::SCRATCH0, dst_addr);
                         br(cb, Assembler::SCRATCH0);
 
@@ -630,7 +632,8 @@ impl Assembler
                     // offset. We're going to assume we can fit into a single
                     // b.cond instruction. It will panic otherwise.
                     cb.label_ref(label_idx, 4, |cb, src_addr, dst_addr| {
-                        bcond(cb, CONDITION, A64Opnd::new_imm(dst_addr - (src_addr - 4)));
+                        let bytes: i32 = (dst_addr - (src_addr - 4)).try_into().unwrap();
+                        bcond(cb, CONDITION, InstructionOffset::from_bytes(bytes));
                     });
                 },
                 Target::FunPtr(_) => unreachable!()
@@ -756,8 +759,8 @@ impl Assembler
                             // references to GC'd Value operands. If the value
                             // being loaded is a heap object, we'll report that
                             // back out to the gc_offsets list.
-                            ldr_literal(cb, out.into(), 2);
-                            b(cb, A64Opnd::new_imm(1 + (SIZEOF_VALUE as i64) / 4));
+                            ldr_literal(cb, out.into(), 2.into());
+                            b(cb, InstructionOffset::from_bytes(4 + (SIZEOF_VALUE as i32)));
                             cb.write_bytes(&value.as_u64().to_le_bytes());
 
                             let ptr_offset: u32 = (cb.get_write_pos() as u32) - (SIZEOF_VALUE as u32);
@@ -844,14 +847,11 @@ impl Assembler
                     // The offset to the call target in bytes
                     let src_addr = cb.get_write_ptr().into_i64();
                     let dst_addr = target.unwrap_fun_ptr() as i64;
-                    let offset = dst_addr - src_addr;
-                    // The offset in instruction count for BL's immediate
-                    let offset = offset / 4;
 
                     // Use BL if the offset is short enough to encode as an immediate.
                     // Otherwise, use BLR with a register.
-                    if b_offset_fits_bits(offset) {
-                        bl(cb, A64Opnd::new_imm(offset));
+                    if b_offset_fits_bits((dst_addr - src_addr) / 4) {
+                        bl(cb, InstructionOffset::from_bytes((dst_addr - src_addr) as i32));
                     } else {
                         emit_load_value(cb, Self::SCRATCH0, dst_addr as u64);
                         blr(cb, Self::SCRATCH0);
@@ -875,19 +875,22 @@ impl Assembler
                             let src_addr = cb.get_write_ptr().into_i64();
                             let dst_addr = dst_ptr.into_i64();
 
-                            // The offset between the two instructions in bytes.
-                            // Note that when we encode this into a b
-                            // instruction, we'll divide by 4 because it accepts
-                            // the number of instructions to jump over.
-                            let offset = dst_addr - src_addr;
-                            let offset = offset / 4;
-
                             // If the offset is short enough, then we'll use the
                             // branch instruction. Otherwise, we'll move the
                             // destination into a register and use the branch
                             // register instruction.
-                            let num_insns = emit_load_value(cb, Self::SCRATCH0, dst_addr as u64);
-                            br(cb, Self::SCRATCH0);
+                            let num_insns = if b_offset_fits_bits((dst_addr - src_addr) / 4) {
+                                b(cb, InstructionOffset::from_bytes((dst_addr - src_addr) as i32));
+                                0
+                            } else {
+                                let num_insns = emit_load_value(cb, Self::SCRATCH0, dst_addr as u64);
+                                br(cb, Self::SCRATCH0);
+                                num_insns
+                            };
+
+                            // Make sure it's always a consistent number of
+                            // instructions in case it gets patched and has to
+                            // use the other branch.
                             for _ in num_insns..4 {
                                 nop(cb);
                             }
@@ -899,7 +902,8 @@ impl Assembler
                             // to assume we can fit into a single b instruction.
                             // It will panic otherwise.
                             cb.label_ref(*label_idx, 4, |cb, src_addr, dst_addr| {
-                                b(cb, A64Opnd::new_imm((dst_addr - (src_addr - 4)) / 4));
+                                let bytes: i32 = (dst_addr - (src_addr - 4)).try_into().unwrap();
+                                b(cb, InstructionOffset::from_bytes(bytes));
                             });
                         },
                         _ => unreachable!()