diff options
Diffstat (limited to 'roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S')
-rw-r--r-- | roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S | 199 |
1 file changed, 199 insertions, 0 deletions
diff --git a/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
new file mode 100644
index 000000000..f97484055
--- /dev/null
+++ b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
@@ -0,0 +1,199 @@
+//
+// Copyright (c) 2012 - 2016, Linaro Limited
+// All rights reserved.
+// Copyright (c) 2015 ARM Ltd
+// All rights reserved.
+// SPDX-License-Identifier: BSD-2-Clause-Patent
+//
+
+// Assumptions:
+//
+// ARMv8-A, AArch64, unaligned accesses
+//
+//
+
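+// A hedged reference sketch in C of what these routines compute; the
+// prototype is assumed from BaseMemoryLib's internal header, and this
+// comment is illustrative only, not part of the build:
+//
+//   VOID *
+//   InternalMemSetMem (
+//     OUT VOID   *Buffer,
+//     IN  UINTN  Length,   // length in bytes
+//     IN  UINT8  Value
+//     )
+//   {
+//     UINT8  *Pointer = (UINT8 *)Buffer;
+//     while (Length-- != 0) {
+//       *Pointer++ = Value;
+//     }
+//     return Buffer;   // x0 (dstin) is likewise preserved below
+//   }
+//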
+#define dstin x0
+#define count x1
+#define val x2
+#define valw w2
+#define dst x3
+#define dstend x4
+#define tmp1 x5
+#define tmp1w w5
+#define tmp2 x6
+#define tmp2w w6
+#define zva_len x7
+#define zva_lenw w7
+
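+// GNU as local-label helper: L(set_long) expands to .Lset_long, which
+// keeps these labels out of the object file's symbol table.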
+#define L(l) .L ## l
+
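+// The 16-, 32- and 64-bit entry points broadcast the value across v0
+// and scale count from elements to bytes, then join the common path.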
+ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
+ASM_PFX(InternalMemSetMem16):
+ dup v0.8H, valw
+ lsl count, count, #1
+ b 0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
+ASM_PFX(InternalMemSetMem32):
+ dup v0.4S, valw
+ lsl count, count, #2
+ b 0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
+ASM_PFX(InternalMemSetMem64):
+ dup v0.2D, val
+ lsl count, count, #3
+ b 0f
+
+ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
+ASM_PFX(InternalMemZeroMem):
+ movi v0.16B, #0
+ b 0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem)
+ASM_PFX(InternalMemSetMem):
+ dup v0.16B, valw
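+ // All entry points join here: v0 holds the replicated fill pattern
+ // and count is the length in bytes. A GPR copy of the pattern is kept
+ // in val for the sub-16-byte stores and the zero test on the ZVA path.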
+0: add dstend, dstin, count
+ mov val, v0.D[0]
+
+ cmp count, 96
+ b.hi L(set_long)
+ cmp count, 16
+ b.hs L(set_medium)
+
+ // Set 0..15 bytes.
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend, -8]
+ ret
+ nop
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend, -2]
+3: ret
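+ // Example: count == 7 has bit 3 clear and bit 2 set, so the two
+ // overlapping word stores at 1: cover bytes 0-3 and 3-6.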
+
+ // Set 16..96 bytes.
+L(set_medium):
+ str q0, [dstin]
+ tbnz count, 6, L(set96)
+ str q0, [dstend, -16]
+ tbz count, 5, 1f
+ str q0, [dstin, 16]
+ str q0, [dstend, -32]
+1: ret
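+ // Example: count == 40 stores 16 bytes at offsets 0, 24, 16 and 8;
+ // the overlapping stores cover all 40 bytes without a loop.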
+
+ .p2align 4
+ // Set 64..96 bytes. Write 64 bytes from the start and
+ // 32 bytes from the end.
+L(set96):
+ str q0, [dstin, 16]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+ nop
+L(set_long):
+ bic dst, dstin, 15
+ str q0, [dstin]
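+ // DC ZVA can only zero memory, so the ZVA path is taken only for a
+ // zero fill of at least 256 bytes: ccmp compares val with 0 when the
+ // C flag is set (count >= 256) and otherwise forces the flags to NE.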
+ cmp count, 256
+ ccmp val, 0, 0, cs
+ b.eq L(try_zva)
+L(no_zva):
+ sub count, dstend, dst // Count is 16 too large.
+ add dst, dst, 16
+ sub count, count, 64 + 16 // Adjust count and bias for loop.
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+L(tail64):
+ subs count, count, 64
+ b.hi 1b
+2: stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+L(try_zva):
+ mrs tmp1, dczid_el0
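+ // DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited; bits 3:0
+ // give log2 of the ZVA block size in words (4 -> 64 bytes).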
+ tbnz tmp1w, 4, L(no_zva)
+ and tmp1w, tmp1w, 15
+ cmp tmp1w, 4 // ZVA size is 64 bytes.
+ b.ne L(zva_128)
+
+ // Write the first and last 64 byte aligned block using stp rather
+ // than using DC ZVA. This is faster on some cores.
+L(zva_64):
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ sub count, dstend, dst // Count is now 128 too large.
+ sub count, count, 128+64+64 // Adjust count and bias for loop.
+ add dst, dst, 128
+ nop
+1: dc zva, dst
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi 1b
+ stp q0, q0, [dst, 0]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+L(zva_128):
+ cmp tmp1w, 5 // ZVA size is 128 bytes.
+ b.ne L(zva_other)
+
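+ // Same approach as zva_64: store the head with stp, zero whole
+ // 128-byte lines with DC ZVA, then patch the tail from dstend.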
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ bic dst, dst, 127
+ sub count, dstend, dst // Count is now 128 too large.
+ sub count, count, 128+128 // Adjust count and bias for loop.
+ add dst, dst, 128
+1: dc zva, dst
+ add dst, dst, 128
+ subs count, count, 128
+ b.hi 1b
+ stp q0, q0, [dstend, -128]
+ stp q0, q0, [dstend, -96]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+L(zva_other):
+ mov tmp2w, 4
+ lsl zva_lenw, tmp2w, tmp1w
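+ // zva_len = 4 << BS bytes; e.g. BS == 6 would give a 256-byte block.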
+ add tmp1, zva_len, 64 // Max alignment bytes written.
+ cmp count, tmp1
+ blo L(no_zva)
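+ // Fall back when the buffer cannot cover the alignment overhead plus
+ // at least one full ZVA block.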
+
+ sub tmp2, zva_len, 1
+ add tmp1, dst, zva_len
+ add dst, dst, 16
+ subs count, tmp1, dst // Actual alignment bytes to write.
+ bic tmp1, tmp1, tmp2 // Aligned dc zva start address.
+ beq 2f
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+ subs count, count, 64
+ b.hi 1b
+2: mov dst, tmp1
+ sub count, dstend, tmp1 // Remaining bytes to write.
+ subs count, count, zva_len
+ b.lo 4f
+3: dc zva, dst
+ add dst, dst, zva_len
+ subs count, count, zva_len
+ b.hs 3b
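+ // Undo the loop bias and let the 64-byte store loop finish the rest.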
+4: add count, count, zva_len
+ b L(tail64)