Diffstat (limited to 'roms/ipxe/src/arch/x86/include/bits/string.h')
-rw-r--r-- | roms/ipxe/src/arch/x86/include/bits/string.h | 344
1 file changed, 344 insertions, 0 deletions
diff --git a/roms/ipxe/src/arch/x86/include/bits/string.h b/roms/ipxe/src/arch/x86/include/bits/string.h
new file mode 100644
index 000000000..c26fe30d5
--- /dev/null
+++ b/roms/ipxe/src/arch/x86/include/bits/string.h
@@ -0,0 +1,344 @@
+#ifndef X86_BITS_STRING_H
+#define X86_BITS_STRING_H
+
+/*
+ * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * You can also choose to distribute this program under the terms of
+ * the Unmodified Binary Distribution Licence (as given in the file
+ * COPYING.UBDL), provided that you have satisfied its requirements.
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+/** @file
+ *
+ * Optimised string operations
+ *
+ */
+
+extern void * __memcpy ( void *dest, const void *src, size_t len );
+extern void * __memcpy_reverse ( void *dest, const void *src, size_t len );
+
+/**
+ * Copy memory area (where length is a compile-time constant)
+ *
+ * @v dest	Destination address
+ * @v src	Source address
+ * @v len	Length
+ * @ret dest	Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+__constant_memcpy ( void *dest, const void *src, size_t len ) {
+	union {
+		uint32_t u32[2];
+		uint16_t u16[4];
+		uint8_t u8[8];
+	} __attribute__ (( __may_alias__ )) *dest_u = dest;
+	const union {
+		uint32_t u32[2];
+		uint16_t u16[4];
+		uint8_t u8[8];
+	} __attribute__ (( __may_alias__ )) *src_u = src;
+	const void *esi;
+	void *edi;
+
+	switch ( len ) {
+	case 0 : /* 0 bytes */
+		return dest;
+	/*
+	 * Single-register moves; these are always better than a
+	 * string operation.  We can clobber an arbitrary two
+	 * registers (data, source, dest can re-use source register)
+	 * instead of being restricted to esi and edi.  There's also a
+	 * much greater potential for optimising with nearby code.
+	 *
+	 */
+	case 1 : /* 4 bytes */
+		dest_u->u8[0] = src_u->u8[0];
+		return dest;
+	case 2 : /* 6 bytes */
+		dest_u->u16[0] = src_u->u16[0];
+		return dest;
+	case 4 : /* 4 bytes */
+		dest_u->u32[0] = src_u->u32[0];
+		return dest;
+	/*
+	 * Double-register moves; these are probably still a win.
+	 *
+	 */
+	case 3 : /* 12 bytes */
+		dest_u->u16[0] = src_u->u16[0];
+		dest_u->u8[2] = src_u->u8[2];
+		return dest;
+	case 5 : /* 10 bytes */
+		dest_u->u32[0] = src_u->u32[0];
+		dest_u->u8[4] = src_u->u8[4];
+		return dest;
+	case 6 : /* 12 bytes */
+		dest_u->u32[0] = src_u->u32[0];
+		dest_u->u16[2] = src_u->u16[2];
+		return dest;
+	case 8 : /* 10 bytes */
+		dest_u->u32[0] = src_u->u32[0];
+		dest_u->u32[1] = src_u->u32[1];
+		return dest;
+	}
+
+	/* Even if we have to load up esi and edi ready for a string
+	 * operation, we can sometimes save space by using multiple
+	 * single-byte "movs" operations instead of loading up ecx and
+	 * using "rep movsb".
+	 *
+	 * "load ecx, rep movsb" is 7 bytes, plus an average of 1 byte
+	 * to allow for saving/restoring ecx 50% of the time.
+	 *
+	 * "movsl" and "movsb" are 1 byte each, "movsw" is two bytes.
+	 * (In 16-bit mode, "movsl" is 2 bytes and "movsw" is 1 byte,
+	 * but "movsl" moves twice as much data, so it balances out).
+	 *
+	 * The cutoff point therefore occurs around 26 bytes; the byte
+	 * requirements for each method are:
+	 *
+	 * len              16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+	 * #bytes (ecx)      8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
+	 * #bytes (no ecx)   4  5  6  7  5  6  7  8  6  7  8  9  7  8  9 10
+	 */
+
+	esi = src;
+	edi = dest;
+
+	if ( len >= 26 )
+		return __memcpy ( dest, src, len );
+
+	if ( len >= 6*4 )
+		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
+				       : "0" ( edi ), "1" ( esi ) : "memory" );
+	if ( len >= 5*4 )
+		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
+				       : "0" ( edi ), "1" ( esi ) : "memory" );
+	if ( len >= 4*4 )
+		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
+				       : "0" ( edi ), "1" ( esi ) : "memory" );
+	if ( len >= 3*4 )
+		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
+				       : "0" ( edi ), "1" ( esi ) : "memory" );
+	if ( len >= 2*4 )
+		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
+				       : "0" ( edi ), "1" ( esi ) : "memory" );
+	if ( len >= 1*4 )
+		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
+				       : "0" ( edi ), "1" ( esi ) : "memory" );
+	if ( ( len % 4 ) >= 2 )
+		__asm__ __volatile__ ( "movsw" : "=&D" ( edi ), "=&S" ( esi )
+				       : "0" ( edi ), "1" ( esi ) : "memory" );
+	if ( ( len % 2 ) >= 1 )
+		__asm__ __volatile__ ( "movsb" : "=&D" ( edi ), "=&S" ( esi )
+				       : "0" ( edi ), "1" ( esi ) : "memory" );
+
+	return dest;
+}
+
+/**
+ * Copy memory area
+ *
+ * @v dest	Destination address
+ * @v src	Source address
+ * @v len	Length
+ * @ret dest	Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+memcpy ( void *dest, const void *src, size_t len ) {
+	if ( __builtin_constant_p ( len ) ) {
+		return __constant_memcpy ( dest, src, len );
+	} else {
+		return __memcpy ( dest, src, len );
+	}
+}
+
+extern void * __memmove ( void *dest, const void *src, size_t len );
+
+/**
+ * Copy (possibly overlapping) memory area
+ *
+ * @v dest	Destination address
+ * @v src	Source address
+ * @v len	Length
+ * @ret dest	Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+memmove ( void *dest, const void *src, size_t len ) {
+	ssize_t offset = ( dest - src );
+
+	if ( __builtin_constant_p ( offset ) ) {
+		if ( offset <= 0 ) {
+			return memcpy ( dest, src, len );
+		} else {
+			return __memcpy_reverse ( dest, src, len );
+		}
+	} else {
+		return __memmove ( dest, src, len );
+	}
+}
+
+/**
+ * Fill memory region
+ *
+ * @v dest	Destination address
+ * @v fill	Fill pattern
+ * @v len	Length
+ * @ret dest	Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+__memset ( void *dest, int fill, size_t len ) {
+	void *discard_D;
+	size_t discard_c;
+
+	__asm__ __volatile__ ( "rep stosb"
+			       : "=&D" ( discard_D ), "=&c" ( discard_c )
+			       : "0" ( dest ), "1" ( len ), "a" ( fill )
+			       : "memory" );
+	return dest;
+}
+
+/**
+ * Fill memory region with zero (where length is a compile-time constant)
+ *
+ * @v dest	Destination address
+ * @v len	Length
+ * @ret dest	Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+__constant_memset_zero ( void *dest, size_t len ) {
+	union {
+		uint32_t u32[2];
+		uint16_t u16[4];
+		uint8_t u8[8];
+	} __attribute__ (( __may_alias__ )) *dest_u = dest;
+	void *edi;
+	uint32_t eax;
+
+	switch ( len ) {
+	case 0 : /* 0 bytes */
+		return dest;
+
+	/* Single-register moves.  Almost certainly better than a
+	 * string operation.  We can avoid clobbering any registers,
+	 * we can reuse a zero that happens to already be in a
+	 * register, and we can optimise away the code entirely if the
+	 * memset() is used to clear a region which then gets
+	 * immediately overwritten.
+	 */
+	case 1 : /* 3 bytes */
+		dest_u->u8[0] = 0;
+		return dest;
+	case 2 : /* 5 bytes */
+		dest_u->u16[0] = 0;
+		return dest;
+	case 4 : /* 6 bytes */
+		dest_u->u32[0] = 0;
+		return dest;
+
+	/* Double-register moves.  Very probably better than a string
+	 * operation.
+	 */
+	case 3 : /* 9 bytes */
+		dest_u->u16[0] = 0;
+		dest_u->u8[2] = 0;
+		return dest;
+	case 5 : /* 10 bytes */
+		dest_u->u32[0] = 0;
+		dest_u->u8[4] = 0;
+		return dest;
+	case 6 : /* 12 bytes */
+		dest_u->u32[0] = 0;
+		dest_u->u16[2] = 0;
+		return dest;
+	case 8 : /* 13 bytes */
+		dest_u->u32[0] = 0;
+		dest_u->u32[1] = 0;
+		return dest;
+	}
+
+	/* As with memcpy(), we can potentially save space by using
+	 * multiple single-byte "stos" instructions instead of loading
+	 * up ecx and using "rep stosb".
+	 *
+	 * "load ecx, rep stosb" is 7 bytes, plus an average of 1 byte
+	 * to allow for saving/restoring ecx 50% of the time.
+	 *
+	 * "stosl" and "stosb" are 1 byte each, "stosw" is two bytes.
+	 *
+	 * The calculations are therefore the same as for memcpy(),
+	 * giving a cutoff point of around 26 bytes.
+	 */
+
+	edi = dest;
+	eax = 0;
+
+	if ( len >= 26 )
+		return __memset ( dest, 0, len );
+
+	if ( len >= 6*4 )
+		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+	if ( len >= 5*4 )
+		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+	if ( len >= 4*4 )
+		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+	if ( len >= 3*4 )
+		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+	if ( len >= 2*4 )
+		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+	if ( len >= 1*4 )
+		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+	if ( ( len % 4 ) >= 2 )
+		__asm__ __volatile__ ( "stosw" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+	if ( ( len % 2 ) >= 1 )
+		__asm__ __volatile__ ( "stosb" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+
+	return dest;
+}
+
+/**
+ * Fill memory region
+ *
+ * @v dest	Destination address
+ * @v fill	Fill pattern
+ * @v len	Length
+ * @ret dest	Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+memset ( void *dest, int fill, size_t len ) {
+
+	if ( __builtin_constant_p ( fill ) && ( fill == 0 ) &&
+	     __builtin_constant_p ( len ) ) {
+		return __constant_memset_zero ( dest, len );
+	} else {
+		return __memset ( dest, fill, len );
+	}
+}
+
+#endif /* X86_BITS_STRING_H */
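
A few notes on how the code above behaves in practice. The memcpy() in this header is only a compile-time dispatcher: __builtin_constant_p ( len ) is true whenever the length is a literal or a sizeof expression, so the specialised __constant_memcpy() inlines and its switch collapses to plain moves. A minimal caller-side sketch (the struct and function names below are invented for illustration, not taken from iPXE):

#include <stdint.h>
#include <string.h>

/* Hypothetical 6-byte MAC address holder. */
struct mac_addr {
	uint8_t octets[6];
};

static void copy_mac ( struct mac_addr *dst, const struct mac_addr *src ) {
	/* sizeof ( dst->octets ) is a compile-time constant, so
	 * __builtin_constant_p ( len ) holds and the "case 6" branch
	 * applies: one 32-bit move plus one 16-bit move, with no call
	 * and no string instruction.
	 */
	memcpy ( dst->octets, src->octets, sizeof ( dst->octets ) );
}

Because __constant_memcpy() is always_inline and returns early from the switch, the unrolled "movsl"/"movsw"/"movsb" tail is never even emitted for these small fixed sizes.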
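The 26-byte cutoff quoted in both copy and fill paths follows from counting instruction encodings: 7 bytes to load ecx and issue "rep movsb", plus 1 byte amortised for saving/restoring ecx, versus 1 byte per "movsl"/"movsb" and 2 bytes per "movsw". The table in the comment can be reproduced with a short standalone check (verification scaffolding only, not part of the patch):

#include <stdio.h>

int main ( void ) {
	unsigned int len;

	for ( len = 16; len <= 31; len++ ) {
		/* "load ecx, rep movsb": 7 bytes, plus 1 byte amortised
		 * for saving/restoring ecx half of the time.
		 */
		unsigned int with_ecx = 7 + 1;
		/* Unrolled string ops: one 1-byte "movsl" per dword,
		 * one 2-byte "movsw" for a trailing word, one 1-byte
		 * "movsb" for a trailing byte.
		 */
		unsigned int no_ecx = ( len / 4 ) +
				      ( ( ( len % 4 ) >= 2 ) ? 2 : 0 ) +
				      ( len % 2 );

		printf ( "len %2u: ecx %u bytes, no-ecx %u bytes\n",
			 len, with_ecx, no_ecx );
	}
	return 0;
}

Running this reproduces the comment's rows exactly: the unrolled form first reaches 8 bytes at len 23 and exceeds it from len 27, so 26 is where "rep" starts to pay for itself on average.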
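memmove() chooses a copy direction from the compile-time sign of dest - src: a non-positive offset is safe for an ordinary ascending copy, while a positive offset (destination overlapping the source from above) needs the descending copy provided by __memcpy_reverse(). A plain-C sketch of that descending copy, as an illustration of the direction argument only (the real routine is out-of-line assembly, presumably a "std"-direction string copy):

#include <stddef.h>
#include <stdint.h>

/* Descending byte copy: safe when dest overlaps src from above. */
static void * demo_memcpy_reverse ( void *dest, const void *src,
				    size_t len ) {
	uint8_t *d = dest;
	const uint8_t *s = src;

	while ( len-- )
		d[len] = s[len];	/* copy highest address first */
	return dest;
}

For example, demo_memcpy_reverse ( buf + 1, buf, n ) shifts a buffer up by one byte correctly, whereas an ascending byte copy would replicate buf[0] across the whole region.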
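Finally, the single-store cases in __constant_memset_zero() exist largely so the compiler can fold or delete them, as the comment about regions that get "immediately overwritten" says. A hypothetical caller showing the effect (type and field names invented for illustration):

#include <stdint.h>
#include <string.h>

/* Invented example type. */
struct packet_header {
	uint32_t type;
	uint32_t length;
};

static void init_header ( struct packet_header *hdr ) {
	/* fill == 0 and len == 8 are compile-time constants, so this
	 * becomes two 32-bit zero stores ("case 8") rather than a
	 * "rep stosb" loop...
	 */
	memset ( hdr, 0, sizeof ( *hdr ) );
	/* ...and the first of those stores is dead and can be
	 * optimised away entirely, because the field is overwritten
	 * on the very next line.
	 */
	hdr->type = 1;
}

A "rep stosb" over the same region would be opaque to this kind of dead-store elimination, which is why the header only falls back to __memset() for non-constant or non-zero fills.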