Index: src/aarch64/sysv.S
--- src/aarch64/sysv.S.orig
+++ src/aarch64/sysv.S
@@ -78,6 +78,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 	cfi_startproc
 CNAME(ffi_call_SYSV):
+	bti	c
 	/* Sign the lr with x1 since that is where it will be stored */
 	SIGN_LR_WITH_REG(x1)
 
@@ -138,78 +139,142 @@ CNAME(ffi_call_SYSV):
 	/* Save the return value as directed.  */
 	adr	x5, 0f
 	and	w4, w4, #AARCH64_RET_MASK
-	add	x5, x5, x4, lsl #3
+	add	x5, x5, x4, lsl #4
 	br	x5
 
-	/* Note that each table entry is 2 insns, and thus 8 bytes.
+	/* Note that each table entry is 4 insns, and thus 16 bytes.
 	   For integer data, note that we're storing into ffi_arg
 	   and therefore we want to extend to 64 bits; these types
 	   have two consecutive entries allocated for them.  */
 	.align	4
-0:	b 99f				/* VOID */
+0:	bti	j			/* VOID */
+	b 99f
+	nop 
 	nop
-1:	str	x0, [x3]		/* INT64 */
+1:	bti	j			/* INT64 */
+	str	x0, [x3]
 	b 99f
-2:	stp	x0, x1, [x3]		/* INT128 */
+	nop
+2:	bti	j			/* INT128 */
+	stp	x0, x1, [x3]
 	b 99f
+	nop
 3:	brk	#1000			/* UNUSED */
 	b 99f
+	nop
+	nop
 4:	brk	#1000			/* UNUSED */
 	b 99f
+	nop
+	nop
 5:	brk	#1000			/* UNUSED */
 	b 99f
+	nop
+	nop
 6:	brk	#1000			/* UNUSED */
 	b 99f
+	nop
+	nop
 7:	brk	#1000			/* UNUSED */
 	b 99f
-8:	st4	{ v0.s, v1.s, v2.s, v3.s }[0], [x3]	/* S4 */
+	nop
+	nop
+8:	bti	j			/* S4 */
+	st4	{ v0.s, v1.s, v2.s, v3.s }[0], [x3]
 	b 99f
-9:	st3	{ v0.s, v1.s, v2.s }[0], [x3]	/* S3 */
+	nop
+9:	bti	j			/* S3 */
+	st3	{ v0.s, v1.s, v2.s }[0], [x3]
 	b 99f
-10:	stp	s0, s1, [x3]		/* S2 */
+	nop
+10:	bti	j			/* S2 */
+	stp	s0, s1, [x3]
 	b 99f
-11:	str	s0, [x3]		/* S1 */
+	nop
+11:	bti	j
+	str	s0, [x3]		/* S1 */
 	b 99f
-12:	st4	{ v0.d, v1.d, v2.d, v3.d }[0], [x3]	/* D4 */
+	nop
+12:	bti	j			/* D4 */
+	st4	{ v0.d, v1.d, v2.d, v3.d }[0], [x3]
 	b 99f
-13:	st3	{ v0.d, v1.d, v2.d }[0], [x3]	/* D3 */
+	nop
+13:	bti	j			/* D3 */
+	st3	{ v0.d, v1.d, v2.d }[0], [x3]
 	b 99f
-14:	stp	d0, d1, [x3]		/* D2 */
+	nop
+14:	bti	j			/* D2 */
+	stp	d0, d1, [x3]
 	b 99f
-15:	str	d0, [x3]		/* D1 */
+	nop
+15:	bti	j			/* D1 */
+	str	d0, [x3]
 	b 99f
-16:	str	q3, [x3, #48]		/* Q4 */
 	nop
-17:	str	q2, [x3, #32]		/* Q3 */
+16:	bti	j			/* Q4 */
+	str	q3, [x3, #48]
 	nop
-18:	stp	q0, q1, [x3]		/* Q2 */
+	nop
+17:	bti	j			/* Q3 */
+	str	q2, [x3, #32]
+	nop
+	nop
+18:	bti	j			/* Q2 */
+	stp	q0, q1, [x3]
 	b 99f
-19:	str	q0, [x3]		/* Q1 */
+	nop
+19:	bti	j			/* Q1 */
+	str	q0, [x3]
 	b 99f
-20:	uxtb	w0, w0			/* UINT8 */
+	nop
+20:	bti	j			/* UINT8 */
+	uxtb	w0, w0
 	str	x0, [x3]
+	nop
 21:	b 99f				/* reserved */
 	nop
-22:	uxth	w0, w0			/* UINT16 */
+	nop
+	nop
+22:	bti	j			/* UINT16 */
+	uxth	w0, w0
 	str	x0, [x3]
+	nop
 23:	b 99f				/* reserved */
 	nop
-24:	mov	w0, w0			/* UINT32 */
+	nop
+	nop
+24:	bti	j			/* UINT32 */
+	mov	w0, w0
 	str	x0, [x3]
+	nop
 25:	b 99f				/* reserved */
 	nop
-26:	sxtb	x0, w0			/* SINT8 */
+	nop
+	nop
+26:	bti	j			/* SINT8 */
+	sxtb	x0, w0
 	str	x0, [x3]
+	nop
 27:	b 99f				/* reserved */
 	nop
-28:	sxth	x0, w0			/* SINT16 */
+	nop
+	nop
+28:	bti	j			/* SINT16 */
+	sxth	x0, w0
 	str	x0, [x3]
+	nop
 29:	b 99f				/* reserved */
 	nop
-30:	sxtw	x0, w0			/* SINT32 */
+	nop
+	nop
+30:	bti	j			/* SINT32 */
+	sxtw	x0, w0
 	str	x0, [x3]
+	nop
 31:	b 99f				/* reserved */
 	nop
+	nop
+	nop
 
 	/* Return now that result has been populated. */
 99:
@@ -246,6 +311,7 @@ CNAME(ffi_call_SYSV):
 	.align 4
 CNAME(ffi_closure_SYSV_V):
 	cfi_startproc
+	bti	c
 	SIGN_LR
 	stp     x29, x30, [sp, #-ffi_closure_SYSV_FS]!
 	cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
@@ -270,6 +336,7 @@ CNAME(ffi_closure_SYSV_V):
 	.align	4
 	cfi_startproc
 CNAME(ffi_closure_SYSV):
+	bti	c
 	SIGN_LR
 	stp     x29, x30, [sp, #-ffi_closure_SYSV_FS]!
 	cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
@@ -299,74 +366,136 @@ CNAME(ffi_closure_SYSV):
 	/* Load the return value as directed.  */
 	adr	x1, 0f
 	and	w0, w0, #AARCH64_RET_MASK
-	add	x1, x1, x0, lsl #3
+	add	x1, x1, x0, lsl #4
 	add	x3, sp, #16+CALL_CONTEXT_SIZE
 	br	x1
 
-	/* Note that each table entry is 2 insns, and thus 8 bytes.  */
+	/* Note that each table entry is 4 insns, and thus 16 bytes.  */
 	.align	4
-0:	b	99f			/* VOID */
+0:	bti	j			/* VOID */
+	b	99f
 	nop
-1:	ldr	x0, [x3]		/* INT64 */
+	nop
+1:	bti	j			/* INT64 */
+	ldr	x0, [x3]
 	b	99f
-2:	ldp	x0, x1, [x3]		/* INT128 */
+	nop
+2:	bti	j			/* INT128 */
+	ldp	x0, x1, [x3]
 	b	99f
+	nop
 3:	brk	#1000			/* UNUSED */
 	nop
+	nop
+	nop
 4:	brk	#1000			/* UNUSED */
 	nop
+	nop
+	nop
 5:	brk	#1000			/* UNUSED */
 	nop
+	nop
+	nop
 6:	brk	#1000			/* UNUSED */
 	nop
+	nop
+	nop
 7:	brk	#1000			/* UNUSED */
 	nop
-8:	ldr	s3, [x3, #12]		/* S4 */
 	nop
-9:	ldr	s2, [x3, #8]		/* S3 */
 	nop
-10:	ldp	s0, s1, [x3]		/* S2 */
+8:	bti	j			/* S4 */
+	ldr	s3, [x3, #12]
+	nop
+	nop
+9:	bti	j			/* S3 */
+	ldr	s2, [x3, #8]
+	nop
+	nop
+10:	bti	j			/* S2 */
+	ldp	s0, s1, [x3]
 	b	99f
-11:	ldr	s0, [x3]		/* S1 */
+	nop
+11:	bti	j			/* S1 */
+	ldr	s0, [x3]
 	b	99f
-12:	ldr	d3, [x3, #24]		/* D4 */
 	nop
-13:	ldr	d2, [x3, #16]		/* D3 */
+12:	bti	j			/* D4 */
+	ldr	d3, [x3, #24]
 	nop
-14:	ldp	d0, d1, [x3]		/* D2 */
+	nop
+13:	bti	j			/* D3 */
+	ldr	d2, [x3, #16]
+	nop
+	nop
+14:	bti	j			/* D2 */
+	ldp	d0, d1, [x3]
 	b	99f
-15:	ldr	d0, [x3]		/* D1 */
+	nop
+15:	bti	j			/* D1 */
+	ldr	d0, [x3]
 	b	99f
-16:	ldr	q3, [x3, #48]		/* Q4 */
 	nop
-17:	ldr	q2, [x3, #32]		/* Q3 */
+16:	bti	j			/* Q4 */
+	ldr	q3, [x3, #48]
 	nop
-18:	ldp	q0, q1, [x3]		/* Q2 */
+	nop
+17:	bti	j			/* Q3 */
+	ldr	q2, [x3, #32]
+	nop
+	nop
+18:	bti	j			/* Q2 */
+	ldp	q0, q1, [x3]
 	b	99f
-19:	ldr	q0, [x3]		/* Q1 */
+	nop
+19:	bti	j			/* Q1 */
+	ldr	q0, [x3]
 	b	99f
-20:	ldrb	w0, [x3, #BE(7)]	/* UINT8 */
+	nop
+20:	bti	j			/* UINT8 */
+	ldrb	w0, [x3, #BE(7)]
 	b	99f
+	nop
 21:	brk	#1000			/* reserved */
 	nop
-22:	ldrh	w0, [x3, #BE(6)]	/* UINT16 */
+	nop
+	nop
+22:	bti	j			/* UINT16 */
+	ldrh	w0, [x3, #BE(6)]
 	b	99f
+	nop
 23:	brk	#1000			/* reserved */
 	nop
-24:	ldr	w0, [x3, #BE(4)]	/* UINT32 */
+	nop
+	nop
+24:	bti	j			/* UINT32 */
+	ldr	w0, [x3, #BE(4)]
 	b	99f
+	nop
 25:	brk	#1000			/* reserved */
 	nop
-26:	ldrsb	x0, [x3, #BE(7)]	/* SINT8 */
+	nop
+	nop
+26:	bti	j			/* SINT8 */
+	ldrsb	x0, [x3, #BE(7)]
 	b	99f
+	nop
 27:	brk	#1000			/* reserved */
 	nop
-28:	ldrsh	x0, [x3, #BE(6)]	/* SINT16 */
+	nop
+	nop
+28:	bti	j			/* SINT16 */
+	ldrsh	x0, [x3, #BE(6)]
 	b	99f
+	nop
 29:	brk	#1000			/* reserved */
 	nop
-30:	ldrsw	x0, [x3, #BE(4)]	/* SINT32 */
 	nop
+	nop
+30:	bti	j			/* SINT32 */
+	ldrsw	x0, [x3, #BE(4)]
+	nop
+	nop
 31:					/* reserved */
 99:	ldp     x29, x30, [sp], #ffi_closure_SYSV_FS
 	cfi_adjust_cfa_offset (-ffi_closure_SYSV_FS)
@@ -479,6 +608,7 @@ CNAME(ffi_closure_trampoline_table_page):
 	.align 4
 CNAME(ffi_go_closure_SYSV_V):
 	cfi_startproc
+	bti	c
 	stp     x29, x30, [sp, #-ffi_closure_SYSV_FS]!
 	cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
 	cfi_rel_offset (x29, 0)
@@ -502,6 +632,7 @@ CNAME(ffi_go_closure_SYSV_V):
 	.align	4
 	cfi_startproc
 CNAME(ffi_go_closure_SYSV):
+	bti	c
 	stp     x29, x30, [sp, #-ffi_closure_SYSV_FS]!
 	cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
 	cfi_rel_offset (x29, 0)
