diff options
author | Angelos Mouzakitis <a.mouzakitis@virtualopensystems.com> | 2023-10-10 14:33:42 +0000 |
---|---|---|
committer | Angelos Mouzakitis <a.mouzakitis@virtualopensystems.com> | 2023-10-10 14:33:42 +0000 |
commit | af1a266670d040d2f4083ff309d732d648afba2a (patch) | |
tree | 2fc46203448ddcc6f81546d379abfaeb323575e9 /roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S | |
parent | e02cda008591317b1625707ff8e115a4841aa889 (diff) |
Change-Id: Iaf8d18082d3991dec7c0ebbea540f092188eb4ec
Diffstat (limited to 'roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S')
-rw-r--r-- | roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S | 139 |
1 files changed, 139 insertions, 0 deletions
diff --git a/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S new file mode 100644 index 000000000..8673b76ec --- /dev/null +++ b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S @@ -0,0 +1,139 @@ +//
+// Copyright (c) 2014, ARM Limited
+// All rights Reserved.
+// SPDX-License-Identifier: BSD-2-Clause-Patent
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64
+// Neon Available.
+//
+
+// Arguments and results.
+#define srcin x0
+#define cntin x1
+#define chrin w2
+// Return value: aliases x0, the register that also carries srcin on entry.
+#define result x0
+// Scalars: src = 32-byte-aligned load pointer, tmp = scratch, wtmp2 = 0x40100401 constant, synd = syndrome, soff = srcin % 32, cntrem = cntin % 32.
+#define src x3
+#define tmp x4
+#define wtmp2 w5
+#define synd x6
+#define soff x9
+#define cntrem x10
+// Vectors: vrepchr = chrin in every byte, vdata1/2 = loaded 32-byte chunk, vhas_chr1/2 = cmeq results, vrepmask = wtmp2 in every 32-bit lane, vend = reduction.
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask v5
+#define vend v6
+
+//
+// Core algorithm:
+//
+// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+// per byte. For each tuple, bit 0 is set if the relevant byte matched the
+// requested character and bit 1 is not used (faster than using a 32-bit
+// syndrome). Since the bits in the syndrome reflect exactly the order in which
+// the bytes occur in the buffer, counting trailing zeros lets us identify
+// exactly which byte has matched.
+//
+
+ASM_GLOBAL ASM_PFX(InternalMemScanMem8) // Entry point: find the first byte equal to chrin in the cntin bytes starting at srcin.
+ASM_PFX(InternalMemScanMem8): // Returns the address of the match in x0, or 0 if nothing matched or cntin is 0.
+ // Do not dereference srcin if no bytes to compare.
+ cbz cntin, .Lzero_length
+ //
+ // Magic constant 0x40100401 allows us to identify which lane matches
+ // the requested byte.
+ // Its bytes are 0x01, 0x04, 0x10, 0x40: a different even bit for each byte of a 32-bit lane.
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin // replicate the low byte of chrin into all 16 byte lanes
+ // Work with aligned 32-byte chunks
+ bic src, srcin, #31 // src = srcin rounded down to a 32-byte boundary
+ dup vrepmask.4s, wtmp2
+ ands soff, srcin, #31 // soff = offset of srcin inside its 32-byte block; Z is set when it is 0
+ and cntrem, cntin, #31 // cntrem = cntin % 32, used at .Lmasklast; does not touch the flags
+ b.eq .Lloop // already aligned: no partial first block to handle
+
+ //
+ // Input string is not 32-byte aligned. We calculate the syndrome
+ // value for the aligned 32 bytes block containing the first bytes
+ // and mask the irrelevant part.
+ //
+
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ sub tmp, soff, #32 // tmp = -(32 - soff): minus the bytes of this block at or after srcin
+ adds cntin, cntin, tmp // cntin -= 32 - soff; these flags are tested by the b.ls further down
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend.16b, vend.16b, vend.16b // 128->64
+ mov synd, vend.d[0]
+ // Clear the soff*2 lower bits (they describe bytes located before srcin)
+ lsl tmp, soff, #1
+ lsr synd, synd, tmp
+ lsl synd, synd, tmp
+ // The first block can also be the last (flags still come from the adds above)
+ b.ls .Lmasklast // taken when cntin was <= 32 - soff before the adds
+ // Have we found something already?
+ cbnz synd, .Ltail
+
+.Lloop:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ subs cntin, cntin, #32 // flags are consumed by the b.ls below and by the b.hi in .Lend
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ // If we're out of data we finish regardless of the result
+ b.ls .Lend // taken when at most 32 bytes were left before the subs
+ // Use a fast check for the termination condition
+ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend.2d, vend.2d, vend.2d
+ mov synd, vend.d[0] // non-zero when any of the 32 bytes matched
+ // We're not out of data, loop if we haven't found the character
+ cbz synd, .Lloop
+
+.Lend:
+ // Termination condition found, let's calculate the syndrome value
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend.16b, vend.16b, vend.16b // 128->64
+ mov synd, vend.d[0]
+ // Only do the clear for the last possible block
+ b.hi .Ltail // flags are still those of the subs in .Lloop: hi means more than 32 bytes were left
+
+.Lmasklast:
+ // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
+ add tmp, cntrem, soff
+ and tmp, tmp, #31
+ sub tmp, tmp, #32
+ neg tmp, tmp, lsl #1 // tmp = 64 - 2 * ((cntrem + soff) % 32)
+ lsl synd, synd, tmp // register shifts use the count modulo 64, so a count of 64 leaves synd unchanged
+ lsr synd, synd, tmp
+
+.Ltail:
+ // Count the trailing zeros using bit reversing
+ rbit synd, synd
+ // Compensate the last post-increment
+ sub src, src, #32
+ // Check that we have found a character
+ cmp synd, #0
+ // And count the leading zeros
+ clz synd, synd
+ // Compute the potential result
+ add result, src, synd, lsr #1 // the syndrome has two bits per byte, hence the halving
+ // Select result or NULL
+ csel result, xzr, result, eq
+ ret
+
+.Lzero_length:
+ mov result, #0
+ ret
|