kernel: 5.4: import wireguard backport

Rather than using the clunky, old, slower wireguard-linux-compat
out-of-tree module, this commit does a patch-by-patch backport of
upstream's wireguard to 5.4. This specific backport is in widespread
use, being part of SUSE's enterprise kernel, Oracle's enterprise
kernel, Google's Android kernel, Gentoo's distro kernel, and probably
more I've forgotten about. It's definitely the "more proper" way of
adding wireguard to a kernel than the ugly compat.h hell of the
wireguard-linux-compat repo. And most importantly for OpenWrt, it
allows using the same module configuration code for 5.10 as for 5.4,
with no need for bifurcation.

These patches are from the backport tree which is maintained in the
open here: https://git.zx2c4.com/wireguard-linux/log/?h=backport-5.4.y
I'll be sending PRs to update this as needed.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Tue, 3 Nov 2020 17:28:09 +0100
Subject: [PATCH] crypto: arm/chacha-neon - optimize for non-block size
 multiples

commit 86cd97ec4b943af35562a74688bc4e909b32c3d1 upstream.

The current NEON based ChaCha implementation for ARM is optimized for
multiples of 4x the ChaCha block size (64 bytes). This makes sense for
block encryption, but given that ChaCha is also often used in the
context of networking, it makes sense to consider arbitrary length
inputs as well.

For example, WireGuard typically uses 1420 byte packets, and performing
ChaCha encryption involves 5 invocations of chacha_4block_xor_neon()
and 3 invocations of chacha_block_xor_neon(), where the last one also
involves a memcpy() using a buffer on the stack to process the final
chunk of 1420 % 64 == 12 bytes.
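
To make that arithmetic concrete, the old chunking can be sketched in
plain C (illustrative only; it mirrors the loop structure of the
pre-patch chacha_doneon(), it is not kernel code):

  #include <stdio.h>

  #define CHACHA_BLOCK_SIZE 64

  int main(void)
  {
          unsigned int bytes = 1420, four_blocks = 0, single_blocks = 0;

          while (bytes >= CHACHA_BLOCK_SIZE * 4) {   /* chacha_4block_xor_neon() */
                  four_blocks++;
                  bytes -= CHACHA_BLOCK_SIZE * 4;
          }
          while (bytes >= CHACHA_BLOCK_SIZE) {       /* chacha_block_xor_neon() */
                  single_blocks++;
                  bytes -= CHACHA_BLOCK_SIZE;
          }
          /* the remaining partial chunk takes one more block call via memcpy() */
          printf("%u x 4-block, %u+1 x 1-block, %u byte tail\n",
                 four_blocks, single_blocks, bytes);
          /* prints: 5 x 4-block, 2+1 x 1-block, 12 byte tail */
          return 0;
  }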

Let's optimize for this case as well, by letting chacha_4block_xor_neon()
deal with any input size between 64 and 256 bytes, using NEON permutation
instructions and overlapping loads and stores. This way, the 140 byte
tail of a 1420 byte input buffer can simply be processed in one go.
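
The overlapping-store idea itself is easy to model in scalar C: to
cover a tail of len bytes (32 < len <= 64) with two fixed-size 32 byte
stores, the second store is simply moved back so it ends exactly at
dst + len, rewriting a few bytes the first store already produced. A
minimal sketch (hypothetical helper, not from this patch; the NEON
code additionally uses vtbl.8 to rotate the keystream into place):

  #include <string.h>

  static void xor_tail(unsigned char *dst, const unsigned char *src,
                       const unsigned char *keystream, unsigned int len)
  {
          unsigned char tmp[64];
          unsigned int i;

          for (i = 0; i < len; i++)
                  tmp[i] = src[i] ^ keystream[i];
          memcpy(dst, tmp, 32);                       /* first 32 bytes */
          memcpy(dst + len - 32, tmp + len - 32, 32); /* overlapping store */
  }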

This results in the following performance improvements for 1420 byte
blocks, without significant impact on power-of-2 input sizes. (Note
that Raspberry Pi is widely used in combination with a 32-bit kernel,
even though the core is 64-bit capable)

   Cortex-A8  (BeagleBone)      :  7%
   Cortex-A15 (Calxeda Midway)  : 21%
   Cortex-A53 (Raspberry Pi 3)  :  3%
   Cortex-A72 (Raspberry Pi 4)  : 19%

Cc: Eric Biggers <ebiggers@google.com>
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/arm/crypto/chacha-glue.c      | 34 +++++------
 arch/arm/crypto/chacha-neon-core.S | 97 +++++++++++++++++++++++++++---
 2 files changed, 107 insertions(+), 24 deletions(-)

--- a/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -23,7 +23,7 @@
 asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
                                      int nrounds);
 asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
-                                      int nrounds);
+                                      int nrounds, unsigned int nbytes);
 asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
 asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
 
@@ -42,24 +42,24 @@ static void chacha_doneon(u32 *state, u8
 {
        u8 buf[CHACHA_BLOCK_SIZE];
 
-       while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-               chacha_4block_xor_neon(state, dst, src, nrounds);
-               bytes -= CHACHA_BLOCK_SIZE * 4;
-               src += CHACHA_BLOCK_SIZE * 4;
-               dst += CHACHA_BLOCK_SIZE * 4;
-               state[12] += 4;
-       }
-       while (bytes >= CHACHA_BLOCK_SIZE) {
-               chacha_block_xor_neon(state, dst, src, nrounds);
-               bytes -= CHACHA_BLOCK_SIZE;
-               src += CHACHA_BLOCK_SIZE;
-               dst += CHACHA_BLOCK_SIZE;
-               state[12]++;
+       while (bytes > CHACHA_BLOCK_SIZE) {
+               unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
+
+               chacha_4block_xor_neon(state, dst, src, nrounds, l);
+               bytes -= l;
+               src += l;
+               dst += l;
+               state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
        }
        if (bytes) {
-               memcpy(buf, src, bytes);
-               chacha_block_xor_neon(state, buf, buf, nrounds);
-               memcpy(dst, buf, bytes);
+               const u8 *s = src;
+               u8 *d = dst;
+
+               if (bytes != CHACHA_BLOCK_SIZE)
+                       s = d = memcpy(buf, src, bytes);
+               chacha_block_xor_neon(state, d, s, nrounds);
+               if (d != dst)
+                       memcpy(dst, buf, bytes);
        }
 }
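
With this change, a 1420 byte packet is consumed in five 256 byte
chunks followed by a single 140 byte chunk; DIV_ROUND_UP() keeps the
block counter in state[12] correct when that last chunk is not a whole
number of blocks. A standalone trace of the new loop (plain C, same
constants, illustrative only):

  #include <stdio.h>

  #define CHACHA_BLOCK_SIZE 64
  #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

  int main(void)
  {
          unsigned int bytes = 1420, ctr = 0;

          while (bytes > CHACHA_BLOCK_SIZE) {
                  unsigned int l = bytes < CHACHA_BLOCK_SIZE * 4U ?
                                   bytes : CHACHA_BLOCK_SIZE * 4U;

                  printf("chacha_4block_xor_neon(..., nbytes=%u)\n", l);
                  bytes -= l;
                  ctr += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
          }
          /* prints nbytes=256 five times, then nbytes=140; leaves
             bytes == 0 and ctr == 23 (== ceil(1420 / 64)) */
          return 0;
  }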

--- a/arch/arm/crypto/chacha-neon-core.S
+++ b/arch/arm/crypto/chacha-neon-core.S
@@ -47,6 +47,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/cache.h>
 
        .text
        .fpu            neon
@@ -205,7 +206,7 @@ ENDPROC(hchacha_block_neon)
 
        .align          5
 ENTRY(chacha_4block_xor_neon)
-       push            {r4-r5}
+       push            {r4, lr}
        mov             r4, sp                  // preserve the stack pointer
        sub             ip, sp, #0x20           // allocate a 32 byte buffer
        bic             ip, ip, #0x1f           // aligned to 32 bytes
@@ -229,10 +230,10 @@ ENTRY(chacha_4block_xor_neon)
        vld1.32         {q0-q1}, [r0]
        vld1.32         {q2-q3}, [ip]
 
-       adr             r5, .Lctrinc
+       adr             lr, .Lctrinc
        vdup.32         q15, d7[1]
        vdup.32         q14, d7[0]
-       vld1.32         {q4}, [r5, :128]
+       vld1.32         {q4}, [lr, :128]
        vdup.32         q13, d6[1]
        vdup.32         q12, d6[0]
        vdup.32         q11, d5[1]
@@ -455,7 +456,7 @@ ENTRY(chacha_4block_xor_neon)
 
        // Re-interleave the words in the first two rows of each block (x0..7).
        // Also add the counter values 0-3 to x12[0-3].
-       vld1.32         {q8}, [r5, :128]        // load counter values 0-3
+       vld1.32         {q8}, [lr, :128]        // load counter values 0-3
        vzip.32         q0, q1                  // => (0 1 0 1) (0 1 0 1)
        vzip.32         q2, q3                  // => (2 3 2 3) (2 3 2 3)
        vzip.32         q4, q5                  // => (4 5 4 5) (4 5 4 5)
@@ -493,6 +494,8 @@ ENTRY(chacha_4block_xor_neon)
 
        // Re-interleave the words in the last two rows of each block (x8..15).
        vld1.32         {q8-q9}, [sp, :256]
+       mov             sp, r4          // restore original stack pointer
+       ldr             r4, [r4, #8]    // load number of bytes
        vzip.32         q12, q13        // => (12 13 12 13) (12 13 12 13)
        vzip.32         q14, q15        // => (14 15 14 15) (14 15 14 15)
        vzip.32         q8, q9          // => (8 9 8 9) (8 9 8 9)
@@ -520,41 +523,121 @@ ENTRY(chacha_4block_xor_neon)
        // XOR the rest of the data with the keystream
 
        vld1.8          {q0-q1}, [r2]!
+       subs            r4, r4, #96
        veor            q0, q0, q8
        veor            q1, q1, q12
+       ble             .Lle96
        vst1.8          {q0-q1}, [r1]!
 
        vld1.8          {q0-q1}, [r2]!
+       subs            r4, r4, #32
        veor            q0, q0, q2
        veor            q1, q1, q6
+       ble             .Lle128
        vst1.8          {q0-q1}, [r1]!
 
        vld1.8          {q0-q1}, [r2]!
+       subs            r4, r4, #32
        veor            q0, q0, q10
        veor            q1, q1, q14
+       ble             .Lle160
        vst1.8          {q0-q1}, [r1]!
 
        vld1.8          {q0-q1}, [r2]!
+       subs            r4, r4, #32
        veor            q0, q0, q4
        veor            q1, q1, q5
+       ble             .Lle192
        vst1.8          {q0-q1}, [r1]!
 
        vld1.8          {q0-q1}, [r2]!
+       subs            r4, r4, #32
        veor            q0, q0, q9
        veor            q1, q1, q13
+       ble             .Lle224
        vst1.8          {q0-q1}, [r1]!
 
        vld1.8          {q0-q1}, [r2]!
+       subs            r4, r4, #32
        veor            q0, q0, q3
        veor            q1, q1, q7
+       blt             .Llt256
+.Lout:
        vst1.8          {q0-q1}, [r1]!
 
        vld1.8          {q0-q1}, [r2]
-       mov             sp, r4          // restore original stack pointer
        veor            q0, q0, q11
        veor            q1, q1, q15
        vst1.8          {q0-q1}, [r1]
 
-       pop             {r4-r5}
-       bx              lr
+       pop             {r4, pc}
+
+.Lle192:
+       vmov            q4, q9
+       vmov            q5, q13
+
+.Lle160:
+       // nothing to do
+
+.Lfinalblock:
+       // Process the final block if processing less than 4 full blocks.
+       // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
+       // previous 32 byte output block that still needs to be written at
+       // [r1] in q0-q1.
+       beq             .Lfullblock
+
+.Lpartialblock:
+       adr             lr, .Lpermute + 32
+       add             r2, r2, r4
+       add             lr, lr, r4
+       add             r4, r4, r1
+
+       vld1.8          {q2-q3}, [lr]
+       vld1.8          {q6-q7}, [r2]
+
+       add             r4, r4, #32
+
+       vtbl.8          d4, {q4-q5}, d4
+       vtbl.8          d5, {q4-q5}, d5
+       vtbl.8          d6, {q4-q5}, d6
+       vtbl.8          d7, {q4-q5}, d7
+
+       veor            q6, q6, q2
+       veor            q7, q7, q3
+
+       vst1.8          {q6-q7}, [r4]   // overlapping stores
+       vst1.8          {q0-q1}, [r1]
+       pop             {r4, pc}
+
+.Lfullblock:
+       vmov            q11, q4
+       vmov            q15, q5
+       b               .Lout
+.Lle96:
+       vmov            q4, q2
+       vmov            q5, q6
+       b               .Lfinalblock
+.Lle128:
+       vmov            q4, q10
+       vmov            q5, q14
+       b               .Lfinalblock
+.Lle224:
+       vmov            q4, q3
+       vmov            q5, q7
+       b               .Lfinalblock
+.Llt256:
+       vmov            q4, q11
+       vmov            q5, q15
+       b               .Lpartialblock
 ENDPROC(chacha_4block_xor_neon)
+
+       .align          L1_CACHE_SHIFT
+.Lpermute:
+       .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+       .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+       .byte           0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+       .byte           0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+       .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+       .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+       .byte           0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+       .byte           0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
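
A note on the .Lpermute table for readers unfamiliar with the vtbl.8
idiom: the table holds the byte indexes 0..31 twice in a row, so the
32 byte window loaded at .Lpermute + 32 + r4 (r4 being negative on the
partial-block path) is the identity permutation rotated by r4, and
vtbl.8 uses it to rotate the 32 byte keystream window by the same
amount before the overlapping store. A plain-C model (illustrative
only, not part of the patch):

  #include <stdio.h>

  int main(void)
  {
          unsigned char table[64], ks[32], rot[32];
          int t = 12 - 32;        /* e.g. a 12 byte tail: r4 == -20 */
          int i;

          for (i = 0; i < 64; i++)
                  table[i] = i % 32;      /* 0..31, 0..31 */
          for (i = 0; i < 32; i++)
                  ks[i] = i;              /* stand-in keystream bytes */

          for (i = 0; i < 32; i++)        /* what vtbl.8 computes */
                  rot[i] = ks[table[32 + t + i]];

          /* rot is ks[12..31] followed by ks[0..11]: after the rotation,
             the overlapping store lands ks[0..11] exactly on the 12 byte
             tail of the output buffer. */
          for (i = 0; i < 32; i++)
                  printf("%u ", rot[i]);
          printf("\n");
          return 0;
  }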