Diffstat (limited to 'roms/ipxe/src/arch/x86/include/bits/string.h')
-rw-r--r--  roms/ipxe/src/arch/x86/include/bits/string.h  |  344
1 file changed, 344 insertions(+), 0 deletions(-)
diff --git a/roms/ipxe/src/arch/x86/include/bits/string.h b/roms/ipxe/src/arch/x86/include/bits/string.h
new file mode 100644
index 000000000..c26fe30d5
--- /dev/null
+++ b/roms/ipxe/src/arch/x86/include/bits/string.h
@@ -0,0 +1,344 @@
+#ifndef X86_BITS_STRING_H
+#define X86_BITS_STRING_H
+
+/*
+ * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * You can also choose to distribute this program under the terms of
+ * the Unmodified Binary Distribution Licence (as given in the file
+ * COPYING.UBDL), provided that you have satisfied its requirements.
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+/** @file
+ *
+ * Optimised string operations
+ *
+ */
+
+extern void * __memcpy ( void *dest, const void *src, size_t len );
+extern void * __memcpy_reverse ( void *dest, const void *src, size_t len );
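
The out-of-line __memcpy() and __memcpy_reverse() are defined in the arch-specific string.c rather than in this header. Purely as an illustration of the contract they satisfy, a minimal forward byte copy could look like the sketch below; the name and body are hypothetical, not the actual iPXE definitions.

    /* Illustrative sketch only: a plain "rep movsb" forward copy.
     * Hypothetical name; the real __memcpy() lives in string.c.
     */
    static inline void * __memcpy_sketch ( void *dest, const void *src,
                                           size_t len ) {
        void *edi = dest;
        const void *esi = src;
        int discard_ecx;

        /* Copy ecx bytes from (esi) to (edi), one byte at a time */
        __asm__ __volatile__ ( "rep movsb"
                               : "=&D" ( edi ), "=&S" ( esi ),
                                 "=&c" ( discard_ecx )
                               : "0" ( edi ), "1" ( esi ), "2" ( len )
                               : "memory" );
        return dest;
    }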
+
+/**
+ * Copy memory area (where length is a compile-time constant)
+ *
+ * @v dest Destination address
+ * @v src Source address
+ * @v len Length
+ * @ret dest Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+__constant_memcpy ( void *dest, const void *src, size_t len ) {
+ union {
+ uint32_t u32[2];
+ uint16_t u16[4];
+ uint8_t u8[8];
+ } __attribute__ (( __may_alias__ )) *dest_u = dest;
+ const union {
+ uint32_t u32[2];
+ uint16_t u16[4];
+ uint8_t u8[8];
+ } __attribute__ (( __may_alias__ )) *src_u = src;
+ const void *esi;
+ void *edi;
+
+ switch ( len ) {
+ case 0 : /* 0 bytes */
+ return dest;
+ /*
+ * Single-register moves; these are always better than a
+ * string operation. We can clobber an arbitrary two
+ * registers (data, source, dest can re-use source register)
+ * instead of being restricted to esi and edi. There's also a
+ * much greater potential for optimising with nearby code.
+ *
+ */
+ case 1 : /* 4 bytes */
+ dest_u->u8[0] = src_u->u8[0];
+ return dest;
+ case 2 : /* 6 bytes */
+ dest_u->u16[0] = src_u->u16[0];
+ return dest;
+ case 4 : /* 4 bytes */
+ dest_u->u32[0] = src_u->u32[0];
+ return dest;
+ /*
+ * Double-register moves; these are probably still a win.
+ *
+ */
+ case 3 : /* 12 bytes */
+ dest_u->u16[0] = src_u->u16[0];
+ dest_u->u8[2] = src_u->u8[2];
+ return dest;
+ case 5 : /* 10 bytes */
+ dest_u->u32[0] = src_u->u32[0];
+ dest_u->u8[4] = src_u->u8[4];
+ return dest;
+ case 6 : /* 12 bytes */
+ dest_u->u32[0] = src_u->u32[0];
+ dest_u->u16[2] = src_u->u16[2];
+ return dest;
+ case 8 : /* 10 bytes */
+ dest_u->u32[0] = src_u->u32[0];
+ dest_u->u32[1] = src_u->u32[1];
+ return dest;
+ }
+
+ /* Even if we have to load up esi and edi ready for a string
+ * operation, we can sometimes save space by using multiple
+ * single-byte "movs" operations instead of loading up ecx and
+ * using "rep movsb".
+ *
+ * "load ecx, rep movsb" is 7 bytes, plus an average of 1 byte
+ * to allow for saving/restoring ecx 50% of the time.
+ *
+ * "movsl" and "movsb" are 1 byte each, "movsw" is two bytes.
+ * (In 16-bit mode, "movsl" is 2 bytes and "movsw" is 1 byte,
+ * but "movsl" moves twice as much data, so it balances out).
+ *
+ * The cutoff point therefore occurs around 26 bytes; the byte
+ * requirements for each method are:
+ *
+ * len 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+ * #bytes (ecx) 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
+ * #bytes (no ecx) 4 5 6 7 5 6 7 8 6 7 8 9 7 8 9 10
+ */
+
+ esi = src;
+ edi = dest;
+
+ if ( len >= 26 )
+ return __memcpy ( dest, src, len );
+
+ if ( len >= 6*4 )
+ __asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
+ : "0" ( edi ), "1" ( esi ) : "memory" );
+ if ( len >= 5*4 )
+ __asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
+ : "0" ( edi ), "1" ( esi ) : "memory" );
+ if ( len >= 4*4 )
+ __asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
+ : "0" ( edi ), "1" ( esi ) : "memory" );
+ if ( len >= 3*4 )
+ __asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
+ : "0" ( edi ), "1" ( esi ) : "memory" );
+ if ( len >= 2*4 )
+ __asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
+ : "0" ( edi ), "1" ( esi ) : "memory" );
+ if ( len >= 1*4 )
+ __asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
+ : "0" ( edi ), "1" ( esi ) : "memory" );
+ if ( ( len % 4 ) >= 2 )
+ __asm__ __volatile__ ( "movsw" : "=&D" ( edi ), "=&S" ( esi )
+ : "0" ( edi ), "1" ( esi ) : "memory" );
+ if ( ( len % 2 ) >= 1 )
+ __asm__ __volatile__ ( "movsb" : "=&D" ( edi ), "=&S" ( esi )
+ : "0" ( edi ), "1" ( esi ) : "memory" );
+
+ return dest;
+}
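
To make the decomposition concrete: for a hypothetical 22-byte structure, the constant-length path above emits five "movsl" (bytes 0-19) and one "movsw" (bytes 20-21), i.e. 7 bytes of code with no ecx load, matching the len=22 column of the table. The type and helper below are illustrative only, not part of this file.

    /* Hypothetical 22-byte type used only for illustration */
    struct demo_header {
        uint8_t raw[22];
    };

    /* sizeof ( *dest ) is a compile-time constant, so this compiles
     * down to five "movsl" plus one "movsw"
     */
    static inline void demo_copy_header ( struct demo_header *dest,
                                          const struct demo_header *src ) {
        __constant_memcpy ( dest, src, sizeof ( *dest ) );
    }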
+
+/**
+ * Copy memory area
+ *
+ * @v dest Destination address
+ * @v src Source address
+ * @v len Length
+ * @ret dest Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+memcpy ( void *dest, const void *src, size_t len ) {
+ if ( __builtin_constant_p ( len ) ) {
+ return __constant_memcpy ( dest, src, len );
+ } else {
+ return __memcpy ( dest, src, len );
+ }
+}
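
A hypothetical caller showing the dispatch (illustrative only): a literal length inlines via __constant_memcpy(), while a length known only at run time compiles to a call to the out-of-line __memcpy().

    static inline void demo_memcpy_dispatch ( void *dest, const void *src,
                                              size_t runtime_len ) {
        memcpy ( dest, src, 16 );          /* constant: inlined moves */
        memcpy ( dest, src, runtime_len ); /* variable: __memcpy() call */
    }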
+
+extern void * __memmove ( void *dest, const void *src, size_t len );
+
+/**
+ * Copy (possibly overlapping) memory area
+ *
+ * @v dest Destination address
+ * @v src Source address
+ * @v len Length
+ * @ret dest Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+memmove ( void *dest, const void *src, size_t len ) {
+ ssize_t offset = ( dest - src );
+
+ if ( __builtin_constant_p ( offset ) ) {
+ if ( offset <= 0 ) {
+ return memcpy ( dest, src, len );
+ } else {
+ return __memcpy_reverse ( dest, src, len );
+ }
+ } else {
+ return __memmove ( dest, src, len );
+ }
+}
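
For example (hypothetical helper, not part of this file): deleting the first element of an array shifts the remainder down, so ( dest - src ) typically folds to a constant negative offset and the forward memcpy() path is chosen; shifting upwards instead would give a positive offset and select __memcpy_reverse().

    static inline void demo_delete_first ( uint32_t *array, size_t count ) {
        /* dest < src: constant negative offset, forward copy is safe */
        memmove ( &array[0], &array[1],
                  ( ( count - 1 ) * sizeof ( array[0] ) ) );
    }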
+
+/**
+ * Fill memory region
+ *
+ * @v dest Destination address
+ * @v fill Fill pattern
+ * @v len Length
+ * @ret dest Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+__memset ( void *dest, int fill, size_t len ) {
+ void *discard_D;
+ size_t discard_c;
+
+ __asm__ __volatile__ ( "rep stosb"
+ : "=&D" ( discard_D ), "=&c" ( discard_c )
+ : "0" ( dest ), "1" ( len ), "a" ( fill )
+ : "memory" );
+ return dest;
+}
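
For reference, the "rep stosb" above behaves like the following byte loop (an illustrative equivalent only, not an alternative definition): %al is stored to (%edi) and %edi advanced, %ecx times.

    /* Illustrative C equivalent of "rep stosb" */
    static inline void * demo_memset_equiv ( void *dest, int fill,
                                             size_t len ) {
        uint8_t *p = dest;

        while ( len-- )
            *p++ = fill;  /* stosb: store al, advance edi, decrement ecx */
        return dest;
    }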
+
+/**
+ * Fill memory region with zero (where length is a compile-time constant)
+ *
+ * @v dest Destination address
+ * @v len Length
+ * @ret dest Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+__constant_memset_zero ( void *dest, size_t len ) {
+ union {
+ uint32_t u32[2];
+ uint16_t u16[4];
+ uint8_t u8[8];
+ } __attribute__ (( __may_alias__ )) *dest_u = dest;
+ void *edi;
+ uint32_t eax;
+
+ switch ( len ) {
+ case 0 : /* 0 bytes */
+ return dest;
+
+ /* Single-register moves. Almost certainly better than a
+ * string operation. We can avoid clobbering any registers,
+ * we can reuse a zero that happens to already be in a
+ * register, and we can optimise away the code entirely if the
+ * memset() is used to clear a region which then gets
+ * immediately overwritten.
+ */
+ case 1 : /* 3 bytes */
+ dest_u->u8[0] = 0;
+ return dest;
+ case 2 : /* 5 bytes */
+ dest_u->u16[0] = 0;
+ return dest;
+ case 4 : /* 6 bytes */
+ dest_u->u32[0] = 0;
+ return dest;
+
+ /* Double-register moves. Very probably better than a string
+ * operation.
+ */
+ case 3 : /* 9 bytes */
+ dest_u->u16[0] = 0;
+ dest_u->u8[2] = 0;
+ return dest;
+ case 5 : /* 10 bytes */
+ dest_u->u32[0] = 0;
+ dest_u->u8[4] = 0;
+ return dest;
+ case 6 : /* 12 bytes */
+ dest_u->u32[0] = 0;
+ dest_u->u16[2] = 0;
+ return dest;
+ case 8 : /* 13 bytes */
+ dest_u->u32[0] = 0;
+ dest_u->u32[1] = 0;
+ return dest;
+ }
+
+ /* As with memcpy(), we can potentially save space by using
+ * multiple single-byte "stos" instructions instead of loading
+ * up ecx and using "rep stosb".
+ *
+ * "load ecx, rep movsb" is 7 bytes, plus an average of 1 byte
+ * to allow for saving/restoring ecx 50% of the time.
+ *
+ * "stosl" and "stosb" are 1 byte each, "stosw" is two bytes.
+ *
+ * The calculations are therefore the same as for memcpy(),
+ * giving a cutoff point of around 26 bytes.
+ */
+
+ edi = dest;
+ eax = 0;
+
+ if ( len >= 26 )
+ return __memset ( dest, 0, len );
+
+ if ( len >= 6*4 )
+ __asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+ : "0" ( edi ), "1" ( eax ) : "memory" );
+ if ( len >= 5*4 )
+ __asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+ : "0" ( edi ), "1" ( eax ) : "memory" );
+ if ( len >= 4*4 )
+ __asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+ : "0" ( edi ), "1" ( eax ) : "memory" );
+ if ( len >= 3*4 )
+ __asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+ : "0" ( edi ), "1" ( eax ) : "memory" );
+ if ( len >= 2*4 )
+ __asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+ : "0" ( edi ), "1" ( eax ) : "memory" );
+ if ( len >= 1*4 )
+ __asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+ : "0" ( edi ), "1" ( eax ) : "memory" );
+ if ( ( len % 4 ) >= 2 )
+ __asm__ __volatile__ ( "stosw" : "=&D" ( edi ), "=&a" ( eax )
+ : "0" ( edi ), "1" ( eax ) : "memory" );
+ if ( ( len % 2 ) >= 1 )
+ __asm__ __volatile__ ( "stosb" : "=&D" ( edi ), "=&a" ( eax )
+ : "0" ( edi ), "1" ( eax ) : "memory" );
+
+ return dest;
+}
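
As a concrete illustration (hypothetical helper, not part of this file): zeroing a 10-byte region with a constant length emits two "stosl" (bytes 0-7) and one "stosw" (bytes 8-9), with eax zeroed once up front and no ecx load.

    static inline void demo_zero_ten ( void *buf ) {
        __constant_memset_zero ( buf, 10 ); /* two "stosl" + one "stosw" */
    }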
+
+/**
+ * Fill memory region
+ *
+ * @v dest Destination address
+ * @v fill Fill pattern
+ * @v len Length
+ * @ret dest Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+memset ( void *dest, int fill, size_t len ) {
+
+ if ( __builtin_constant_p ( fill ) && ( fill == 0 ) &&
+ __builtin_constant_p ( len ) ) {
+ return __constant_memset_zero ( dest, len );
+ } else {
+ return __memset ( dest, fill, len );
+ }
+}
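
A hypothetical caller showing which path each combination takes (illustrative only): only a constant zero fill with a constant length reaches __constant_memset_zero(); everything else falls through to __memset().

    static inline void demo_memset_dispatch ( void *buf, int fill,
                                              size_t runtime_len ) {
        memset ( buf, 0, 8 );           /* constant zero+len: inlined stores */
        memset ( buf, fill, 8 );        /* variable fill: __memset() */
        memset ( buf, 0, runtime_len ); /* variable len: __memset() */
    }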
+
+#endif /* X86_BITS_STRING_H */