aboutsummaryrefslogtreecommitdiffstats
path: root/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm
diff options
context:
space:
mode:
authorAngelos Mouzakitis <a.mouzakitis@virtualopensystems.com>2023-10-10 14:33:42 +0000
committerAngelos Mouzakitis <a.mouzakitis@virtualopensystems.com>2023-10-10 14:33:42 +0000
commitaf1a266670d040d2f4083ff309d732d648afba2a (patch)
tree2fc46203448ddcc6f81546d379abfaeb323575e9 /roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm
parente02cda008591317b1625707ff8e115a4841aa889 (diff)
Add submodule dependency filesHEADmaster
Change-Id: Iaf8d18082d3991dec7c0ebbea540f092188eb4ec
Diffstat (limited to 'roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm')
-rw-r--r--roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareGuid.S44
-rw-r--r--roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareGuid.asm48
-rw-r--r--roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S117
-rw-r--r--roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm118
-rw-r--r--roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S170
-rw-r--r--roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm141
-rw-r--r--roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/MemLibGuid.c159
-rw-r--r--roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S122
-rw-r--r--roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm121
-rw-r--r--roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c136
-rw-r--r--roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S88
-rw-r--r--roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm90
12 files changed, 1354 insertions, 0 deletions
diff --git a/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareGuid.S b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareGuid.S
new file mode 100644
index 000000000..7de70ae34
--- /dev/null
+++ b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareGuid.S
@@ -0,0 +1,44 @@
+//
+// Copyright (c) 2016, Linaro Limited
+// All rights reserved.
+// SPDX-License-Identifier: BSD-2-Clause-Patent
+//
+
+ .text
+ .thumb
+ .syntax unified
+ .align 5
+ .type ASM_PFX(InternalMemCompareGuid), %function
+ASM_GLOBAL ASM_PFX(InternalMemCompareGuid)
+ASM_PFX(InternalMemCompareGuid):
+ push {r4, lr}
+ ldr r2, [r0]
+ ldr r3, [r0, #4]
+ ldr r4, [r0, #8]
+ ldr r0, [r0, #12]
+ cbz r1, 1f
+ ldr ip, [r1]
+ ldr lr, [r1, #4]
+ cmp r2, ip
+ it eq
+ cmpeq.n r3, lr
+ beq 0f
+ movs r0, #0
+ pop {r4, pc}
+
+0: ldr r2, [r1, #8]
+ ldr r3, [r1, #12]
+ cmp r4, r2
+ it eq
+ cmpeq.n r0, r3
+ bne 2f
+ movs r0, #1
+ pop {r4, pc}
+
+1: orrs r2, r2, r3
+ orrs r4, r4, r0
+ movs r0, #1
+ orrs r2, r2, r4
+2: it ne
+ movne.n r0, #0
+ pop {r4, pc}
diff --git a/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareGuid.asm b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareGuid.asm
new file mode 100644
index 000000000..3316b8dc6
--- /dev/null
+++ b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareGuid.asm
@@ -0,0 +1,48 @@
+;
+; Copyright (c) 2016, Linaro Limited
+; All rights reserved.
+; SPDX-License-Identifier: BSD-2-Clause-Patent
+;
+
+ EXPORT InternalMemCompareGuid
+ THUMB
+ AREA CompareGuid, CODE, READONLY, CODEALIGN, ALIGN=5
+
+InternalMemCompareGuid
+ push {r4, lr}
+ ldr r2, [r0]
+ ldr r3, [r0, #4]
+ ldr r4, [r0, #8]
+ ldr r0, [r0, #12]
+ cbz r1, L1
+ ldr ip, [r1]
+ ldr lr, [r1, #4]
+ cmp r2, ip
+ it eq
+ cmpeq r3, lr
+ beq L0
+ movs r0, #0
+ pop {r4, pc}
+
+L0
+ ldr r2, [r1, #8]
+ ldr r3, [r1, #12]
+ cmp r4, r2
+ it eq
+ cmpeq r0, r3
+ bne L2
+ movs r0, #1
+ pop {r4, pc}
+
+L1
+ orrs r2, r2, r3
+ orrs r4, r4, r0
+ movs r0, #1
+ orrs r2, r2, r4
+
+L2
+ it ne
+ movne r0, #0
+ pop {r4, pc}
+
+ END
diff --git a/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S
new file mode 100644
index 000000000..d32816c7b
--- /dev/null
+++ b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S
@@ -0,0 +1,117 @@
+//
+// Copyright (c) 2013 - 2016, Linaro Limited
+// All rights reserved.
+// SPDX-License-Identifier: BSD-2-Clause-Patent
+//
+
+// Parameters and result.
+#define src1 r0
+#define src2 r1
+#define limit r2
+#define result r0
+
+// Internal variables.
+#define data1 r3
+#define data2 r4
+#define limit_wd r5
+#define diff r6
+#define tmp1 r7
+#define tmp2 r12
+#define pos r8
+#define mask r14
+
+ .text
+ .thumb
+ .syntax unified
+ .align 5
+ .type ASM_PFX(InternalMemCompareMem), %function
+ASM_GLOBAL ASM_PFX(InternalMemCompareMem)
+ASM_PFX(InternalMemCompareMem):
+ push {r4-r8, lr}
+ eor tmp1, src1, src2
+ tst tmp1, #3
+ bne .Lmisaligned4
+ ands tmp1, src1, #3
+ bne .Lmutual_align
+ add limit_wd, limit, #3
+ nop.w
+ lsr limit_wd, limit_wd, #2
+
+ // Start of performance-critical section -- one 32B cache line.
+.Lloop_aligned:
+ ldr data1, [src1], #4
+ ldr data2, [src2], #4
+.Lstart_realigned:
+ subs limit_wd, limit_wd, #1
+ eor diff, data1, data2 // Non-zero if differences found.
+ cbnz diff, 0f
+ bne .Lloop_aligned
+ // End of performance-critical section -- one 32B cache line.
+
+ // Not reached the limit, must have found a diff.
+0: cbnz limit_wd, .Lnot_limit
+
+ // Limit % 4 == 0 => all bytes significant.
+ ands limit, limit, #3
+ beq .Lnot_limit
+
+ lsl limit, limit, #3 // Bits -> bytes.
+ mov mask, #~0
+ lsl mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ orr diff, diff, mask
+
+.Lnot_limit:
+ rev diff, diff
+ rev data1, data1
+ rev data2, data2
+
+ // The MS-non-zero bit of DIFF marks either the first bit
+ // that is different, or the end of the significant data.
+ // Shifting left now will bring the critical information into the
+ // top bits.
+ clz pos, diff
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+
+ // But we need to zero-extend (char is unsigned) the value and then
+ // perform a signed 32-bit subtraction.
+ lsr data1, data1, #28
+ sub result, data1, data2, lsr #28
+ pop {r4-r8, pc}
+
+.Lmutual_align:
+ // Sources are mutually aligned, but are not currently at an
+ // alignment boundary. Round down the addresses and then mask off
+ // the bytes that precede the start point.
+ bic src1, src1, #3
+ bic src2, src2, #3
+ add limit, limit, tmp1 // Adjust the limit for the extra.
+ lsl tmp1, tmp1, #3 // Bytes beyond alignment -> bits.
+ ldr data1, [src1], #4
+ rsb tmp1, tmp1, #32 // Bits to alignment -32.
+ ldr data2, [src2], #4
+ mov tmp2, #~0
+
+ // Little-endian. Early bytes are at LSB.
+ lsr tmp2, tmp2, tmp1 // Shift (tmp1 & 31).
+ add limit_wd, limit, #3
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ lsr limit_wd, limit_wd, #2
+ b .Lstart_realigned
+
+.Lmisaligned4:
+ sub limit, limit, #1
+1:
+ // Perhaps we can do better than this.
+ ldrb data1, [src1], #1
+ ldrb data2, [src2], #1
+ subs limit, limit, #1
+ it cs
+ cmpcs.n data1, data2
+ beq 1b
+ sub result, data1, data2
+ pop {r4-r8, pc}
diff --git a/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm
new file mode 100644
index 000000000..27a1efc8c
--- /dev/null
+++ b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm
@@ -0,0 +1,118 @@
+;
+; Copyright (c) 2013 - 2016, Linaro Limited
+; All rights reserved.
+; SPDX-License-Identifier: BSD-2-Clause-Patent
+;
+
+; Parameters and result.
+#define src1 r0
+#define src2 r1
+#define limit r2
+#define result r0
+
+; Internal variables.
+#define data1 r3
+#define data2 r4
+#define limit_wd r5
+#define diff r6
+#define tmp1 r7
+#define tmp2 r12
+#define pos r8
+#define mask r14
+
+ EXPORT InternalMemCompareMem
+ THUMB
+ AREA CompareMem, CODE, READONLY
+
+InternalMemCompareMem
+ push {r4-r8, lr}
+ eor tmp1, src1, src2
+ tst tmp1, #3
+ bne Lmisaligned4
+ ands tmp1, src1, #3
+ bne Lmutual_align
+ add limit_wd, limit, #3
+ nop.w
+ lsr limit_wd, limit_wd, #2
+
+ ; Start of performance-critical section -- one 32B cache line.
+Lloop_aligned
+ ldr data1, [src1], #4
+ ldr data2, [src2], #4
+Lstart_realigned
+ subs limit_wd, limit_wd, #1
+ eor diff, data1, data2 ; Non-zero if differences found.
+ cbnz diff, L0
+ bne Lloop_aligned
+ ; End of performance-critical section -- one 32B cache line.
+
+ ; Not reached the limit, must have found a diff.
+L0
+ cbnz limit_wd, Lnot_limit
+
+ // Limit % 4 == 0 => all bytes significant.
+ ands limit, limit, #3
+ beq Lnot_limit
+
+ lsl limit, limit, #3 // Bits -> bytes.
+ mov mask, #~0
+ lsl mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ orr diff, diff, mask
+
+Lnot_limit
+ rev diff, diff
+ rev data1, data1
+ rev data2, data2
+
+ ; The MS-non-zero bit of DIFF marks either the first bit
+ ; that is different, or the end of the significant data.
+ ; Shifting left now will bring the critical information into the
+ ; top bits.
+ clz pos, diff
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+
+ ; But we need to zero-extend (char is unsigned) the value and then
+ ; perform a signed 32-bit subtraction.
+ lsr data1, data1, #28
+ sub result, data1, data2, lsr #28
+ pop {r4-r8, pc}
+
+Lmutual_align
+ ; Sources are mutually aligned, but are not currently at an
+ ; alignment boundary. Round down the addresses and then mask off
+ ; the bytes that precede the start point.
+ bic src1, src1, #3
+ bic src2, src2, #3
+ add limit, limit, tmp1 ; Adjust the limit for the extra.
+ lsl tmp1, tmp1, #2 ; Bytes beyond alignment -> bits.
+ ldr data1, [src1], #4
+ neg tmp1, tmp1 ; Bits to alignment -32.
+ ldr data2, [src2], #4
+ mov tmp2, #~0
+
+ ; Little-endian. Early bytes are at LSB.
+ lsr tmp2, tmp2, tmp1 ; Shift (tmp1 & 31).
+ add limit_wd, limit, #3
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ lsr limit_wd, limit_wd, #2
+ b Lstart_realigned
+
+Lmisaligned4
+ sub limit, limit, #1
+L1
+ // Perhaps we can do better than this.
+ ldrb data1, [src1], #1
+ ldrb data2, [src2], #1
+ subs limit, limit, #1
+ it cs
+ cmpcs data1, data2
+ beq L1
+ sub result, data1, data2
+ pop {r4-r8, pc}
+
+ END
diff --git a/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S
new file mode 100644
index 000000000..c36af196a
--- /dev/null
+++ b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S
@@ -0,0 +1,170 @@
+#------------------------------------------------------------------------------
+#
+# CopyMem() worker for ARM
+#
+# This file started out as C code that did 64 bit moves if the buffer was
+# 32-bit aligned, else it does a byte copy. It also does a byte copy for
+# any trailing bytes. It was updated to do 32-byte copies using stm/ldm.
+#
+# Copyright (c) 2008 - 2010, Apple Inc. All rights reserved.<BR>
+# Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>
+# SPDX-License-Identifier: BSD-2-Clause-Patent
+#
+#------------------------------------------------------------------------------
+
+ .text
+ .thumb
+ .syntax unified
+
+/**
+ Copy Length bytes from Source to Destination. Overlap is OK.
+
+ This implementation
+
+ @param Destination Target of copy
+ @param Source Place to copy from
+ @param Length Number of bytes to copy
+
+ @return Destination
+
+
+VOID *
+EFIAPI
+InternalMemCopyMem (
+ OUT VOID *DestinationBuffer,
+ IN CONST VOID *SourceBuffer,
+ IN UINTN Length
+ )
+**/
+ .type ASM_PFX(InternalMemCopyMem), %function
+ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
+ASM_PFX(InternalMemCopyMem):
+ push {r4-r11, lr}
+ // Save the input parameters in extra registers (r11 = destination, r14 = source, r12 = length)
+ mov r11, r0
+ mov r10, r0
+ mov r12, r2
+ mov r14, r1
+
+ cmp r11, r1
+ // If (dest < source)
+ bcc memcopy_check_optim_default
+
+ // If (source + length < dest)
+ rsb r3, r1, r11
+ cmp r12, r3
+ bcc memcopy_check_optim_default
+ b memcopy_check_optim_overlap
+
+memcopy_check_optim_default:
+ // Check if we can use an optimized path ((length >= 32) && destination word-aligned && source word-aligned) for the memcopy (optimized path if r0 == 1)
+ tst r0, #0xF
+ it ne
+ movne.n r0, #0
+ bne memcopy_default
+ tst r1, #0xF
+ it ne
+ movne.n r3, #0
+ it eq
+ moveq.n r3, #1
+ cmp r2, #31
+ it ls
+ movls.n r0, #0
+ bls memcopy_default
+ and r0, r3, #1
+ b memcopy_default
+
+memcopy_check_optim_overlap:
+ // r10 = dest_end, r14 = source_end
+ add r10, r11, r12
+ add r14, r12, r1
+
+ // Are we in the optimized case ((length >= 32) && dest_end word-aligned && source_end word-aligned)
+ cmp r2, #31
+ it ls
+ movls.n r0, #0
+ it hi
+ movhi.n r0, #1
+ tst r10, #0xF
+ it ne
+ movne.n r0, #0
+ tst r14, #0xF
+ it ne
+ movne.n r0, #0
+ b memcopy_overlapped
+
+memcopy_overlapped_non_optim:
+ // We read 1 byte from the end of the source buffer
+ sub r3, r14, #1
+ sub r12, r12, #1
+ ldrb r3, [r3, #0]
+ sub r2, r10, #1
+ cmp r12, #0
+ // We write 1 byte at the end of the dest buffer
+ sub r10, r10, #1
+ sub r14, r14, #1
+ strb r3, [r2, #0]
+ bne memcopy_overlapped_non_optim
+ b memcopy_end
+
+// r10 = dest_end, r14 = source_end
+memcopy_overlapped:
+ // Are we in the optimized case ?
+ cmp r0, #0
+ beq memcopy_overlapped_non_optim
+
+ // Optimized Overlapped - Read 32 bytes
+ sub r14, r14, #32
+ sub r12, r12, #32
+ cmp r12, #31
+ ldmia r14, {r2-r9}
+
+ // If length is less than 32 then disable optim
+ it ls
+ movls.n r0, #0
+
+ cmp r12, #0
+
+ // Optimized Overlapped - Write 32 bytes
+ sub r10, r10, #32
+ stmia r10, {r2-r9}
+
+ // while (length != 0)
+ bne memcopy_overlapped
+ b memcopy_end
+
+memcopy_default_non_optim:
+ // Byte copy
+ ldrb r3, [r14], #1
+ sub r12, r12, #1
+ strb r3, [r10], #1
+
+memcopy_default:
+ cmp r12, #0
+ beq memcopy_end
+
+// r10 = dest, r14 = source
+memcopy_default_loop:
+ cmp r0, #0
+ beq memcopy_default_non_optim
+
+ // Optimized memcopy - Read 32 Bytes
+ sub r12, r12, #32
+ cmp r12, #31
+ ldmia r14!, {r2-r9}
+
+ // If length is less than 32 then disable optim
+ it ls
+ movls.n r0, #0
+
+ cmp r12, #0
+
+ // Optimized memcopy - Write 32 Bytes
+ stmia r10!, {r2-r9}
+
+ // while (length != 0)
+ bne memcopy_default_loop
+
+memcopy_end:
+ mov r0, r11
+ pop {r4-r11, pc}
diff --git a/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm
new file mode 100644
index 000000000..3ebcfd5b4
--- /dev/null
+++ b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm
@@ -0,0 +1,141 @@
+;------------------------------------------------------------------------------
+;
+; CopyMem() worker for ARM
+;
+; This file started out as C code that did 64 bit moves if the buffer was
+; 32-bit aligned, else it does a byte copy. It also does a byte copy for
+; any trailing bytes. It was updated to do 32-byte copies using stm/ldm.
+;
+; Copyright (c) 2008 - 2010, Apple Inc. All rights reserved.<BR>
+; Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>
+; SPDX-License-Identifier: BSD-2-Clause-Patent
+;
+;------------------------------------------------------------------------------
+
+ EXPORT InternalMemCopyMem
+ AREA SetMem, CODE, READONLY
+ THUMB
+
+InternalMemCopyMem
+ stmfd sp!, {r4-r11, lr}
+ // Save the input parameters in extra registers (r11 = destination, r14 = source, r12 = length)
+ mov r11, r0
+ mov r10, r0
+ mov r12, r2
+ mov r14, r1
+
+memcopy_check_overlapped
+ cmp r11, r1
+ // If (dest < source)
+ bcc memcopy_check_optim_default
+
+ // If (source + length < dest)
+ rsb r3, r1, r11
+ cmp r12, r3
+ bcc memcopy_check_optim_default
+ b memcopy_check_optim_overlap
+
+memcopy_check_optim_default
+ // Check if we can use an optimized path ((length >= 32) && destination word-aligned && source word-aligned) for the memcopy (optimized path if r0 == 1)
+ tst r0, #0xF
+ movne r0, #0
+ bne memcopy_default
+ tst r1, #0xF
+ movne r3, #0
+ moveq r3, #1
+ cmp r2, #31
+ movls r0, #0
+ andhi r0, r3, #1
+ b memcopy_default
+
+memcopy_check_optim_overlap
+ // r10 = dest_end, r14 = source_end
+ add r10, r11, r12
+ add r14, r12, r1
+
+ // Are we in the optimized case ((length >= 32) && dest_end word-aligned && source_end word-aligned)
+ cmp r2, #31
+ movls r0, #0
+ movhi r0, #1
+ tst r10, #0xF
+ movne r0, #0
+ tst r14, #0xF
+ movne r0, #0
+ b memcopy_overlapped
+
+memcopy_overlapped_non_optim
+ // We read 1 byte from the end of the source buffer
+ sub r3, r14, #1
+ sub r12, r12, #1
+ ldrb r3, [r3, #0]
+ sub r2, r10, #1
+ cmp r12, #0
+ // We write 1 byte at the end of the dest buffer
+ sub r10, r10, #1
+ sub r14, r14, #1
+ strb r3, [r2, #0]
+ bne memcopy_overlapped_non_optim
+ b memcopy_end
+
+// r10 = dest_end, r14 = source_end
+memcopy_overlapped
+ // Are we in the optimized case ?
+ cmp r0, #0
+ beq memcopy_overlapped_non_optim
+
+ // Optimized Overlapped - Read 32 bytes
+ sub r14, r14, #32
+ sub r12, r12, #32
+ cmp r12, #31
+ ldmia r14, {r2-r9}
+
+ // If length is less than 32 then disable optim
+ movls r0, #0
+
+ cmp r12, #0
+
+ // Optimized Overlapped - Write 32 bytes
+ sub r10, r10, #32
+ stmia r10, {r2-r9}
+
+ // while (length != 0)
+ bne memcopy_overlapped
+ b memcopy_end
+
+memcopy_default_non_optim
+ // Byte copy
+ ldrb r3, [r14], #1
+ sub r12, r12, #1
+ strb r3, [r10], #1
+
+memcopy_default
+ cmp r12, #0
+ beq memcopy_end
+
+// r10 = dest, r14 = source
+memcopy_default_loop
+ cmp r0, #0
+ beq memcopy_default_non_optim
+
+ // Optimized memcopy - Read 32 Bytes
+ sub r12, r12, #32
+ cmp r12, #31
+ ldmia r14!, {r2-r9}
+
+ // If length is less than 32 then disable optim
+ movls r0, #0
+
+ cmp r12, #0
+
+ // Optimized memcopy - Write 32 Bytes
+ stmia r10!, {r2-r9}
+
+ // while (length != 0)
+ bne memcopy_default_loop
+
+memcopy_end
+ mov r0, r11
+ ldmfd sp!, {r4-r11, pc}
+
+ END
+
diff --git a/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/MemLibGuid.c b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/MemLibGuid.c
new file mode 100644
index 000000000..f27b95c11
--- /dev/null
+++ b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/MemLibGuid.c
@@ -0,0 +1,159 @@
+/** @file
+ Implementation of GUID functions for ARM and AARCH64
+
+ Copyright (c) 2006 - 2016, Intel Corporation. All rights reserved.<BR>
+ Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>
+ SPDX-License-Identifier: BSD-2-Clause-Patent
+
+**/
+
+#include "MemLibInternals.h"
+
+/**
+ Internal function to compare two GUIDs.
+
+ This function compares Guid1 to Guid2. If the GUIDs are identical then TRUE is returned.
+ If there are any bit differences in the two GUIDs, then FALSE is returned.
+
+ @param Guid1 A pointer to a 128 bit GUID.
+ @param Guid2 A pointer to a 128 bit GUID.
+
+ @retval TRUE Guid1 and Guid2 are identical.
+ @retval FALSE Guid1 and Guid2 are not identical.
+
+**/
+BOOLEAN
+EFIAPI
+InternalMemCompareGuid (
+ IN CONST GUID *Guid1,
+ IN CONST GUID *Guid2
+ );
+
+/**
+ Copies a source GUID to a destination GUID.
+
+ This function copies the contents of the 128-bit GUID specified by SourceGuid to
+ DestinationGuid, and returns DestinationGuid.
+
+ If DestinationGuid is NULL, then ASSERT().
+ If SourceGuid is NULL, then ASSERT().
+
+ @param DestinationGuid The pointer to the destination GUID.
+ @param SourceGuid The pointer to the source GUID.
+
+ @return DestinationGuid.
+
+**/
+GUID *
+EFIAPI
+CopyGuid (
+ OUT GUID *DestinationGuid,
+ IN CONST GUID *SourceGuid
+ )
+{
+ ASSERT (DestinationGuid != NULL);
+ ASSERT (SourceGuid != NULL);
+
+ return InternalMemCopyMem (DestinationGuid, SourceGuid, sizeof (GUID));
+}
+
+/**
+ Compares two GUIDs.
+
+ This function compares Guid1 to Guid2. If the GUIDs are identical then TRUE is returned.
+ If there are any bit differences in the two GUIDs, then FALSE is returned.
+
+ If Guid1 is NULL, then ASSERT().
+ If Guid2 is NULL, then ASSERT().
+
+ @param Guid1 A pointer to a 128 bit GUID.
+ @param Guid2 A pointer to a 128 bit GUID.
+
+ @retval TRUE Guid1 and Guid2 are identical.
+ @retval FALSE Guid1 and Guid2 are not identical.
+
+**/
+BOOLEAN
+EFIAPI
+CompareGuid (
+ IN CONST GUID *Guid1,
+ IN CONST GUID *Guid2
+ )
+{
+ ASSERT (Guid1 != NULL);
+ ASSERT (Guid2 != NULL);
+
+ return InternalMemCompareGuid (Guid1, Guid2);
+}
+
+/**
+ Scans a target buffer for a GUID, and returns a pointer to the matching GUID
+ in the target buffer.
+
+ This function searches the target buffer specified by Buffer and Length from
+ the lowest address to the highest address at 128-bit increments for the 128-bit
+ GUID value that matches Guid. If a match is found, then a pointer to the matching
+ GUID in the target buffer is returned. If no match is found, then NULL is returned.
+ If Length is 0, then NULL is returned.
+
+ If Length > 0 and Buffer is NULL, then ASSERT().
+ If Buffer is not aligned on a 32-bit boundary, then ASSERT().
+ If Length is not aligned on a 128-bit boundary, then ASSERT().
+ If Length is greater than (MAX_ADDRESS - Buffer + 1), then ASSERT().
+
+ @param Buffer The pointer to the target buffer to scan.
+ @param Length The number of bytes in Buffer to scan.
+ @param Guid The value to search for in the target buffer.
+
+ @return A pointer to the matching Guid in the target buffer or NULL otherwise.
+
+**/
+VOID *
+EFIAPI
+ScanGuid (
+ IN CONST VOID *Buffer,
+ IN UINTN Length,
+ IN CONST GUID *Guid
+ )
+{
+ CONST GUID *GuidPtr;
+
+ ASSERT (((UINTN)Buffer & (sizeof (Guid->Data1) - 1)) == 0);
+ ASSERT (Length <= (MAX_ADDRESS - (UINTN)Buffer + 1));
+ ASSERT ((Length & (sizeof (*GuidPtr) - 1)) == 0);
+
+ GuidPtr = (GUID*)Buffer;
+ Buffer = GuidPtr + Length / sizeof (*GuidPtr);
+ while (GuidPtr < (CONST GUID*)Buffer) {
+ if (InternalMemCompareGuid (GuidPtr, Guid)) {
+ return (VOID*)GuidPtr;
+ }
+ GuidPtr++;
+ }
+ return NULL;
+}
+
+/**
+ Checks if the given GUID is a zero GUID.
+
+ This function checks whether the given GUID is a zero GUID. If the GUID is
+ identical to a zero GUID then TRUE is returned. Otherwise, FALSE is returned.
+
+ If Guid is NULL, then ASSERT().
+
+ @param Guid The pointer to a 128 bit GUID.
+
+ @retval TRUE Guid is a zero GUID.
+ @retval FALSE Guid is not a zero GUID.
+
+**/
+BOOLEAN
+EFIAPI
+IsZeroGuid (
+ IN CONST GUID *Guid
+ )
+{
+ ASSERT (Guid != NULL);
+
+ return InternalMemCompareGuid (Guid, NULL);
+}
diff --git a/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S
new file mode 100644
index 000000000..c0b2dee5d
--- /dev/null
+++ b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S
@@ -0,0 +1,122 @@
+// Copyright (c) 2010-2011, Linaro Limited
+// All rights reserved.
+// SPDX-License-Identifier: BSD-2-Clause-Patent
+//
+
+//
+// Written by Dave Gilbert <david.gilbert@linaro.org>
+//
+// This memchr routine is optimised on a Cortex-A9 and should work on
+// all ARMv7 processors. It has a fast past for short sizes, and has
+// an optimised path for large data sets; the worst case is finding the
+// match early in a large data set.
+//
+
+
+// 2011-02-07 david.gilbert@linaro.org
+// Extracted from local git a5b438d861
+// 2011-07-14 david.gilbert@linaro.org
+// Import endianness fix from local git ea786f1b
+// 2011-12-07 david.gilbert@linaro.org
+// Removed unneeded cbz from align loop
+
+// this lets us check a flag in a 00/ff byte easily in either endianness
+#define CHARTSTMASK(c) 1<<(c*8)
+
+ .text
+ .thumb
+ .syntax unified
+
+ .type ASM_PFX(InternalMemScanMem8), %function
+ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
+ASM_PFX(InternalMemScanMem8):
+ // r0 = start of memory to scan
+ // r1 = length
+ // r2 = character to look for
+ // returns r0 = pointer to character or NULL if not found
+ uxtb r2, r2 // Don't think we can trust the caller to actually pass a char
+
+ cmp r1, #16 // If it's short don't bother with anything clever
+ blt 20f
+
+ tst r0, #7 // If it's already aligned skip the next bit
+ beq 10f
+
+ // Work up to an aligned point
+5:
+ ldrb r3, [r0],#1
+ subs r1, r1, #1
+ cmp r3, r2
+ beq 50f // If it matches exit found
+ tst r0, #7
+ bne 5b // If not aligned yet then do next byte
+
+10:
+ // At this point, we are aligned, we know we have at least 8 bytes to work with
+ push {r4-r7}
+ orr r2, r2, r2, lsl #8 // expand the match word across to all bytes
+ orr r2, r2, r2, lsl #16
+ bic r4, r1, #7 // Number of double words to work with
+ mvns r7, #0 // all F's
+ movs r3, #0
+
+15:
+ ldmia r0!, {r5,r6}
+ subs r4, r4, #8
+ eor r5, r5, r2 // Get it so that r5,r6 have 00's where the bytes match the target
+ eor r6, r6, r2
+ uadd8 r5, r5, r7 // Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r5, r3, r7 // bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ uadd8 r6, r6, r7 // Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r6, r5, r7 // chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ cbnz r6, 60f
+ bne 15b // (Flags from the subs above) If not run out of bytes then go around again
+
+ pop {r4-r7}
+ and r2, r2, #0xff // Get r2 back to a single character from the expansion above
+ and r1, r1, #7 // Leave the count remaining as the number after the double words have been done
+
+20:
+ cbz r1, 40f // 0 length or hit the end already then not found
+
+21: // Post aligned section, or just a short call
+ ldrb r3, [r0], #1
+ subs r1, r1, #1
+ eor r3, r3, r2 // r3 = 0 if match - doesn't break flags from sub
+ cbz r3, 50f
+ bne 21b // on r1 flags
+
+40:
+ movs r0, #0 // not found
+ bx lr
+
+50:
+ subs r0, r0, #1 // found
+ bx lr
+
+60: // We're here because the fast path found a hit - now we have to track down exactly which word it was
+ // r0 points to the start of the double word after the one that was tested
+ // r5 has the 00/ff pattern for the first word, r6 has the chained value
+ subs r0, r0, #3
+ cmp r5, #0
+ it eq
+ moveq.n r5, r6 // the end is in the 2nd word
+ it ne
+ subne.n r0, r0, #4 // or 2nd byte of 1st word
+
+ // r0 currently points to the 3rd byte of the word containing the hit
+ tst r5, #CHARTSTMASK(0) // 1st character
+ bne 61f
+ adds r0, r0, #1
+ tst r5, #CHARTSTMASK(1) // 2nd character
+ bne 61f
+ adds r0, r0 ,#1
+ tst r5, #(3 << 15) // 2nd & 3rd character
+ // If not the 3rd must be the last one
+ it eq
+ addeq.n r0, r0, #1
+
+61:
+ pop {r4-r7}
+ subs r0, r0, #1
+ bx lr
diff --git a/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm
new file mode 100644
index 000000000..00f4bff19
--- /dev/null
+++ b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm
@@ -0,0 +1,121 @@
+; Copyright (c) 2010-2011, Linaro Limited
+; All rights reserved.
+; SPDX-License-Identifier: BSD-2-Clause-Patent
+;
+
+;
+; Written by Dave Gilbert <david.gilbert@linaro.org>
+;
+; This memchr routine is optimised on a Cortex-A9 and should work on
+; all ARMv7 processors. It has a fast past for short sizes, and has
+; an optimised path for large data sets; the worst case is finding the
+; match early in a large data set.
+;
+
+
+; 2011-02-07 david.gilbert@linaro.org
+; Extracted from local git a5b438d861
+; 2011-07-14 david.gilbert@linaro.org
+; Import endianness fix from local git ea786f1b
+; 2011-12-07 david.gilbert@linaro.org
+; Removed unneeded cbz from align loop
+
+; this lets us check a flag in a 00/ff byte easily in either endianness
+#define CHARTSTMASK(c) 1<<(c*8)
+
+ EXPORT InternalMemScanMem8
+ AREA ScanMem, CODE, READONLY
+ THUMB
+
+InternalMemScanMem8
+ ; r0 = start of memory to scan
+ ; r1 = length
+ ; r2 = character to look for
+ ; returns r0 = pointer to character or NULL if not found
+ uxtb r2, r2 ; Don't think we can trust the caller to actually pass a char
+
+ cmp r1, #16 ; If it's short don't bother with anything clever
+ blt L20
+
+ tst r0, #7 ; If it's already aligned skip the next bit
+ beq L10
+
+ ; Work up to an aligned point
+L5
+ ldrb r3, [r0],#1
+ subs r1, r1, #1
+ cmp r3, r2
+ beq L50 ; If it matches exit found
+ tst r0, #7
+ bne L5 ; If not aligned yet then do next byte
+
+L10
+ ; At this point, we are aligned, we know we have at least 8 bytes to work with
+ push {r4-r7}
+ orr r2, r2, r2, lsl #8 ; expand the match word across to all bytes
+ orr r2, r2, r2, lsl #16
+ bic r4, r1, #7 ; Number of double words to work with
+ mvns r7, #0 ; all F's
+ movs r3, #0
+
+L15
+ ldmia r0!, {r5,r6}
+ subs r4, r4, #8
+ eor r5, r5, r2 ; Get it so that r5,r6 have 00's where the bytes match the target
+ eor r6, r6, r2
+ uadd8 r5, r5, r7 ; Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r5, r3, r7 ; bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ uadd8 r6, r6, r7 ; Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r6, r5, r7 ; chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ cbnz r6, L60
+ bne L15 ; (Flags from the subs above) If not run out of bytes then go around again
+
+ pop {r4-r7}
+ and r2, r2, #0xff ; Get r2 back to a single character from the expansion above
+ and r1, r1, #7 ; Leave the count remaining as the number after the double words have been done
+
+L20
+ cbz r1, L40 ; 0 length or hit the end already then not found
+
+L21 ; Post aligned section, or just a short call
+ ldrb r3, [r0], #1
+ subs r1, r1, #1
+ eor r3, r3, r2 ; r3 = 0 if match - doesn't break flags from sub
+ cbz r3, L50
+ bne L21 ; on r1 flags
+
+L40
+ movs r0, #0 ; not found
+ bx lr
+
+L50
+ subs r0, r0, #1 ; found
+ bx lr
+
+L60 ; We're here because the fast path found a hit - now we have to track down exactly which word it was
+ ; r0 points to the start of the double word after the one that was tested
+ ; r5 has the 00/ff pattern for the first word, r6 has the chained value
+ cmp r5, #0
+ itte eq
+ moveq r5, r6 ; the end is in the 2nd word
+ subeq r0, r0, #3 ; Points to 2nd byte of 2nd word
+ subne r0, r0, #7 ; or 2nd byte of 1st word
+
+ ; r0 currently points to the 3rd byte of the word containing the hit
+ tst r5, #CHARTSTMASK(0) ; 1st character
+ bne L61
+ adds r0, r0, #1
+ tst r5, #CHARTSTMASK(1) ; 2nd character
+ ittt eq
+ addeq r0, r0 ,#1
+ tsteq r5, #(3 << 15) ; 2nd & 3rd character
+ ; If not the 3rd must be the last one
+ addeq r0, r0, #1
+
+L61
+ pop {r4-r7}
+ subs r0, r0, #1
+ bx lr
+
+ END
+
diff --git a/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c
new file mode 100644
index 000000000..e0b4f2707
--- /dev/null
+++ b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c
@@ -0,0 +1,136 @@
+/** @file
+ Architecture Independent Base Memory Library Implementation.
+
+ The following BaseMemoryLib instances contain the same copy of this file:
+ BaseMemoryLib
+ PeiMemoryLib
+ UefiMemoryLib
+
+ Copyright (c) 2006 - 2016, Intel Corporation. All rights reserved.<BR>
+ SPDX-License-Identifier: BSD-2-Clause-Patent
+
+**/
+
+#include "../MemLibInternals.h"
+
+/**
+ Scans a target buffer for a 16-bit value, and returns a pointer to the
+ matching 16-bit value in the target buffer.
+
+ @param Buffer The pointer to the target buffer to scan.
+ @param Length The count of 16-bit value to scan. Must be non-zero.
+ @param Value The value to search for in the target buffer.
+
+ @return The pointer to the first occurrence, or NULL if not found.
+
+**/
+CONST VOID *
+EFIAPI
+InternalMemScanMem16 (
+ IN CONST VOID *Buffer,
+ IN UINTN Length,
+ IN UINT16 Value
+ )
+{
+ CONST UINT16 *Pointer;
+
+ Pointer = (CONST UINT16*)Buffer;
+ do {
+ if (*Pointer == Value) {
+ return Pointer;
+ }
+ ++Pointer;
+ } while (--Length != 0);
+ return NULL;
+}
+
+/**
+ Scans a target buffer for a 32-bit value, and returns a pointer to the
+ matching 32-bit value in the target buffer.
+
+ @param Buffer The pointer to the target buffer to scan.
+ @param Length The count of 32-bit value to scan. Must be non-zero.
+ @param Value The value to search for in the target buffer.
+
+ @return The pointer to the first occurrence, or NULL if not found.
+
+**/
+CONST VOID *
+EFIAPI
+InternalMemScanMem32 (
+ IN CONST VOID *Buffer,
+ IN UINTN Length,
+ IN UINT32 Value
+ )
+{
+ CONST UINT32 *Pointer;
+
+ Pointer = (CONST UINT32*)Buffer;
+ do {
+ if (*Pointer == Value) {
+ return Pointer;
+ }
+ ++Pointer;
+ } while (--Length != 0);
+ return NULL;
+}
+
+/**
+ Scans a target buffer for a 64-bit value, and returns a pointer to the
+ matching 64-bit value in the target buffer.
+
+ @param Buffer The pointer to the target buffer to scan.
+ @param Length The count of 64-bit value to scan. Must be non-zero.
+ @param Value The value to search for in the target buffer.
+
+ @return The pointer to the first occurrence, or NULL if not found.
+
+**/
+CONST VOID *
+EFIAPI
+InternalMemScanMem64 (
+ IN CONST VOID *Buffer,
+ IN UINTN Length,
+ IN UINT64 Value
+ )
+{
+ CONST UINT64 *Pointer;
+
+ Pointer = (CONST UINT64*)Buffer;
+ do {
+ if (*Pointer == Value) {
+ return Pointer;
+ }
+ ++Pointer;
+ } while (--Length != 0);
+ return NULL;
+}
+
+/**
+ Checks whether the contents of a buffer are all zeros.
+
+ @param Buffer The pointer to the buffer to be checked.
+ @param Length The size of the buffer (in bytes) to be checked.
+
+ @retval TRUE Contents of the buffer are all zeros.
+ @retval FALSE Contents of the buffer are not all zeros.
+
+**/
+BOOLEAN
+EFIAPI
+InternalMemIsZeroBuffer (
+ IN CONST VOID *Buffer,
+ IN UINTN Length
+ )
+{
+ CONST UINT8 *BufferData;
+ UINTN Index;
+
+ BufferData = Buffer;
+ for (Index = 0; Index < Length; Index++) {
+ if (BufferData[Index] != 0) {
+ return FALSE;
+ }
+ }
+ return TRUE;
+}
diff --git a/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S
new file mode 100644
index 000000000..f6797004e
--- /dev/null
+++ b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S
@@ -0,0 +1,88 @@
+#------------------------------------------------------------------------------
+#
+# Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>
+#
+# SPDX-License-Identifier: BSD-2-Clause-Patent
+#
+#------------------------------------------------------------------------------
+
+ .text
+ .thumb
+ .syntax unified
+ .align 5
+ .type ASM_PFX(InternalMemSetMem16), %function
+ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
+ASM_PFX(InternalMemSetMem16):
+ uxth r2, r2
+ lsl r1, r1, #1
+ orr r2, r2, r2, lsl #16
+ b 0f
+
+ .type ASM_PFX(InternalMemSetMem32), %function
+ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
+ASM_PFX(InternalMemSetMem32):
+ lsl r1, r1, #2
+ b 0f
+
+ .type ASM_PFX(InternalMemSetMem64), %function
+ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
+ASM_PFX(InternalMemSetMem64):
+ lsl r1, r1, #3
+ b 1f
+
+ .align 5
+ .type ASM_PFX(InternalMemSetMem), %function
+ASM_GLOBAL ASM_PFX(InternalMemSetMem)
+ASM_PFX(InternalMemSetMem):
+ uxtb r2, r2
+ orr r2, r2, r2, lsl #8
+ orr r2, r2, r2, lsl #16
+ b 0f
+
+ .type ASM_PFX(InternalMemZeroMem), %function
+ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
+ASM_PFX(InternalMemZeroMem):
+ movs r2, #0
+0: mov r3, r2
+
+1: push {r4, lr}
+ cmp r1, #16 // fewer than 16 bytes of input?
+ add r1, r1, r0 // r1 := dst + length
+ add lr, r0, #16
+ blt 2f
+ bic lr, lr, #15 // align output pointer
+
+ str r2, [r0] // potentially unaligned store of 4 bytes
+ str r3, [r0, #4] // potentially unaligned store of 4 bytes
+ str r2, [r0, #8] // potentially unaligned store of 4 bytes
+ str r3, [r0, #12] // potentially unaligned store of 4 bytes
+ beq 1f
+
+0: add lr, lr, #16 // advance the output pointer by 16 bytes
+ subs r4, r1, lr // past the output?
+ blt 3f // break out of the loop
+ strd r2, r3, [lr, #-16] // aligned store of 16 bytes
+ strd r2, r3, [lr, #-8]
+ bne 0b // goto beginning of loop
+1: pop {r4, pc}
+
+2: subs r4, r1, lr
+3: adds r4, r4, #16
+ subs r1, r1, #8
+ cmp r4, #4 // between 4 and 15 bytes?
+ blt 4f
+ cmp r4, #8 // between 8 and 15 bytes?
+ sub r4, lr, #16
+ str r2, [r4] // overlapping store of 4 + (4 + 4) + 4 bytes
+ it gt
+ strgt.n r3, [r4, #4]
+ it gt
+ strgt.n r2, [r1]
+ str r3, [r1, #4]
+ pop {r4, pc}
+
+4: cmp r4, #2 // 2 or 3 bytes?
+ strb r2, [lr, #-16] // store 1 byte
+ it ge
+ strhge.n r2, [r1, #6] // store 2 bytes
+ pop {r4, pc}
diff --git a/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm
new file mode 100644
index 000000000..9aee8a0cf
--- /dev/null
+++ b/roms/edk2/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm
@@ -0,0 +1,90 @@
+;------------------------------------------------------------------------------
+;
+; Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>
+;
+; SPDX-License-Identifier: BSD-2-Clause-Patent
+;
+;------------------------------------------------------------------------------
+
+ EXPORT InternalMemZeroMem
+ EXPORT InternalMemSetMem
+ EXPORT InternalMemSetMem16
+ EXPORT InternalMemSetMem32
+ EXPORT InternalMemSetMem64
+
+ AREA SetMem, CODE, READONLY, CODEALIGN, ALIGN=5
+ THUMB
+
+InternalMemSetMem16
+ uxth r2, r2
+ lsl r1, r1, #1
+ orr r2, r2, r2, lsl #16
+ b B0
+
+InternalMemSetMem32
+ lsl r1, r1, #2
+ b B0
+
+InternalMemSetMem64
+ lsl r1, r1, #3
+ b B1
+
+ ALIGN 32
+InternalMemSetMem
+ uxtb r2, r2
+ orr r2, r2, r2, lsl #8
+ orr r2, r2, r2, lsl #16
+ b B0
+
+InternalMemZeroMem
+ movs r2, #0
+B0
+ mov r3, r2
+
+B1
+ push {r4, lr}
+ cmp r1, #16 ; fewer than 16 bytes of input?
+ add r1, r1, r0 ; r1 := dst + length
+ add lr, r0, #16
+ blt L2
+ bic lr, lr, #15 ; align output pointer
+
+ str r2, [r0] ; potentially unaligned store of 4 bytes
+ str r3, [r0, #4] ; potentially unaligned store of 4 bytes
+ str r2, [r0, #8] ; potentially unaligned store of 4 bytes
+ str r3, [r0, #12] ; potentially unaligned store of 4 bytes
+ beq L1
+
+L0
+ add lr, lr, #16 ; advance the output pointer by 16 bytes
+ subs r4, r1, lr ; past the output?
+ blt L3 ; break out of the loop
+ strd r2, r3, [lr, #-16] ; aligned store of 16 bytes
+ strd r2, r3, [lr, #-8]
+ bne L0 ; goto beginning of loop
+L1
+ pop {r4, pc}
+
+L2
+ subs r4, r1, lr
+L3
+ adds r4, r4, #16
+ subs r1, r1, #8
+ cmp r4, #4 ; between 4 and 15 bytes?
+ blt L4
+ cmp r4, #8 ; between 8 and 15 bytes?
+ str r2, [lr, #-16] ; overlapping store of 4 + (4 + 4) + 4 bytes
+ itt gt
+ strgt r3, [lr, #-12]
+ strgt r2, [r1]
+ str r3, [r1, #4]
+ pop {r4, pc}
+
+L4
+ cmp r4, #2 ; 2 or 3 bytes?
+ strb r2, [lr, #-16] ; store 1 byte
+ it ge
+ strhge r2, [r1, #6] ; store 2 bytes
+ pop {r4, pc}
+
+ END