[arm] Add optimised string functions for 64-bit ARM
Signed-off-by: Michael Brown <mcb30@ipxe.org>
This commit is contained in:
parent
a966570dce
commit
95716ece91
|
@ -0,0 +1,249 @@
|
|||
/*
|
||||
* Copyright (C) 2016 Michael Brown <mbrown@fensystems.co.uk>.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
* published by the Free Software Foundation; either version 2 of the
|
||||
* License, or any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||
* 02110-1301, USA.
|
||||
*
|
||||
* You can also choose to distribute this program under the terms of
|
||||
* the Unmodified Binary Distribution Licence (as given in the file
|
||||
* COPYING.UBDL), provided that you have satisfied its requirements.
|
||||
*/
|
||||
|
||||
/** @file
|
||||
*
|
||||
* Optimised string operations
|
||||
*
|
||||
*/
|
||||
|
||||
FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/**
 * Copy memory area
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 *
 * Copies exactly @c len bytes.  As with standard memcpy(), the
 * regions must not overlap; overlapping copies must go via
 * arm64_memmove().
 */
void arm64_memcpy ( void *dest, const void *src, size_t len ) {
	/* Dummy outputs: these exist only to hand scratch registers
	 * to the inline assembly; the values left in them are never
	 * used.
	 */
	void *discard_dest;
	void *discard_end;
	const void *discard_src;
	size_t discard_offset;
	unsigned long discard_data;
	unsigned long discard_low;
	unsigned long discard_high;

	/* If length is too short for an "ldp"/"stp" instruction pair,
	 * then just copy individual bytes.
	 *
	 * Operands: %0 = remaining length (counts down to zero),
	 * %1 = byte in flight, %2 = dest base, %3 = src base.  Bytes
	 * are copied from the highest offset downwards; the direction
	 * is irrelevant since the regions may not overlap.  The
	 * initial "cbz" skips the loop entirely for len==0.
	 */
	if ( len < 16 ) {
		__asm__ __volatile__ ( "cbz %0, 2f\n\t"
				       "\n1:\n\t"
				       "sub %0, %0, #1\n\t"
				       "ldrb %w1, [%3, %0]\n\t"
				       "strb %w1, [%2, %0]\n\t"
				       "cbnz %0, 1b\n\t"
				       "\n2:\n\t"
				       : "=&r" ( discard_offset ),
					 "=&r" ( discard_data )
				       : "r" ( dest ), "r" ( src ), "0" ( len )
				       : "memory" );
		return;
	}

	/* Use "ldp"/"stp" to copy 16 bytes at a time: one initial
	 * potentially unaligned access, multiple destination-aligned
	 * accesses, one final potentially unaligned access.
	 *
	 * Operands: %0 = dest cursor (tied to dest), %1 = src cursor
	 * (tied to src), %2 = 16-byte-aligned end of dest, %3/%4 =
	 * 16-byte data pair, %5 = dest+len, %6 = src+len.
	 *
	 * After the first (possibly unaligned) 16-byte copy, both
	 * cursors are walked back by the dest misalignment ("and" /
	 * "sub" pair), so the main loop stores to aligned dest
	 * addresses (loads from src may remain unaligned, and a few
	 * bytes may be copied twice — harmless for non-overlapping
	 * regions).  The loop runs until the dest cursor reaches
	 * (dest+len) rounded down to 16 ("bic"), and the final pair
	 * covers the last 16 bytes at [dest+len-16].  Note %3 doubles
	 * as the misalignment scratch before the loop.
	 */
	__asm__ __volatile__ ( "ldp %3, %4, [%1], #16\n\t"
			       "stp %3, %4, [%0], #16\n\t"
			       "and %3, %0, #15\n\t"
			       "sub %0, %0, %3\n\t"
			       "sub %1, %1, %3\n\t"
			       "bic %2, %5, #15\n\t"
			       "b 2f\n\t"
			       "\n1:\n\t"
			       "ldp %3, %4, [%1], #16\n\t"
			       "stp %3, %4, [%0], #16\n\t"
			       "\n2:\n\t"
			       "cmp %0, %2\n\t"
			       "bne 1b\n\t"
			       "ldp %3, %4, [%6, #-16]\n\t"
			       "stp %3, %4, [%5, #-16]\n\t"
			       : "=&r" ( discard_dest ),
				 "=&r" ( discard_src ),
				 "=&r" ( discard_end ),
				 "=&r" ( discard_low ),
				 "=&r" ( discard_high )
			       : "r" ( dest + len ), "r" ( src + len ),
				 "0" ( dest ), "1" ( src )
			       : "memory", "cc" );
}
|
||||
|
||||
/**
 * Zero memory region
 *
 * @v dest		Destination region
 * @v len		Length
 */
void arm64_bzero ( void *dest, size_t len ) {
	/* Dummy outputs: scratch registers for the inline assembly;
	 * final values are unused.
	 */
	size_t discard_offset;
	void *discard_dest;
	void *discard_end;

	/* If length is too short for an "stp" instruction, then just
	 * zero individual bytes.
	 *
	 * Operands: %0 = remaining length (counts down to zero),
	 * %1 = dest base.  "wzr" provides the zero byte; the initial
	 * "cbz" skips the loop entirely for len==0.
	 */
	if ( len < 16 ) {
		__asm__ __volatile__ ( "cbz %0, 2f\n\t"
				       "\n1:\n\t"
				       "sub %0, %0, #1\n\t"
				       "strb wzr, [%1, %0]\n\t"
				       "cbnz %0, 1b\n\t"
				       "\n2:\n\t"
				       : "=&r" ( discard_offset )
				       : "r" ( dest ), "0" ( len )
				       : "memory" );
		return;
	}

	/* Use "stp" to zero 16 bytes at a time: one initial
	 * potentially unaligned access, multiple aligned accesses,
	 * one final potentially unaligned access.
	 *
	 * Operands: %0 = dest cursor (tied to dest), %1 = aligned
	 * end, %2 = dest+len.
	 *
	 * After the first (possibly unaligned) 16-byte store, the
	 * cursor is rounded down to 16-byte alignment ("bic %0"), so
	 * loop stores are aligned (some bytes may be zeroed twice —
	 * harmless).  The loop runs until the cursor reaches
	 * (dest+len) rounded down to 16, and the final "stp" covers
	 * the last 16 bytes at [dest+len-16].
	 */
	__asm__ __volatile__ ( "stp xzr, xzr, [%0], #16\n\t"
			       "bic %0, %0, #15\n\t"
			       "bic %1, %2, #15\n\t"
			       "b 2f\n\t"
			       "\n1:\n\t"
			       "stp xzr, xzr, [%0], #16\n\t"
			       "\n2:\n\t"
			       "cmp %0, %1\n\t"
			       "bne 1b\n\t"
			       "stp xzr, xzr, [%2, #-16]\n\t"
			       : "=&r" ( discard_dest ),
				 "=&r" ( discard_end )
			       : "r" ( dest + len ), "0" ( dest )
			       : "memory", "cc" );
}
|
||||
|
||||
/**
 * Fill memory region
 *
 * @v dest		Destination region
 * @v len		Length
 * @v character	Fill character
 *
 * The unusual parameter order is to allow for more efficient
 * tail-calling to arm64_memset() when zeroing a region.
 */
void arm64_memset ( void *dest, size_t len, int character ) {
	/* Dummy output: scratch register for the inline assembly */
	size_t discard_offset;

	/* Use optimised zeroing code if applicable */
	if ( character == 0 ) {
		arm64_bzero ( dest, len );
		return;
	}

	/* Fill one byte at a time.  Calling memset() with a non-zero
	 * value is relatively rare and unlikely to be
	 * performance-critical.
	 *
	 * Operands: %0 = remaining length (counts down to zero),
	 * %1 = dest base, %2 = fill character (low byte stored via
	 * "strb").  The initial "cbz" skips the loop for len==0.
	 */
	__asm__ __volatile__ ( "cbz %0, 2f\n\t"
			       "\n1:\n\t"
			       "sub %0, %0, #1\n\t"
			       "strb %w2, [%1, %0]\n\t"
			       "cbnz %0, 1b\n\t"
			       "\n2:\n\t"
			       : "=&r" ( discard_offset )
			       : "r" ( dest ), "r" ( character ), "0" ( len )
			       : "memory" );
}
|
||||
|
||||
/**
 * Copy (possibly overlapping) memory region forwards
 *
 * @v dest		Destination region
 * @v src		Source region
 * @v len		Length
 *
 * Safe for overlapping regions only when dest lies at or below src.
 */
void arm64_memmove_forwards ( void *dest, const void *src, size_t len ) {
	/* Dummy outputs: scratch registers for the inline assembly */
	void *discard_dest;
	const void *discard_src;
	unsigned long discard_data;

	/* Assume memmove() is not performance-critical, and perform a
	 * bytewise copy for simplicity.
	 *
	 * Operands: %0 = dest cursor (tied to dest), %1 = src cursor
	 * (tied to src), %2 = byte in flight, %3 = dest+len.  Copies
	 * from the lowest address upwards; the initial "b 2f" makes
	 * len==0 fall straight through the end-of-region test.
	 */
	__asm__ __volatile__ ( "b 2f\n\t"
			       "\n1:\n\t"
			       "ldrb %w2, [%1], #1\n\t"
			       "strb %w2, [%0], #1\n\t"
			       "\n2:\n\t"
			       "cmp %0, %3\n\t"
			       "bne 1b\n\t"
			       : "=&r" ( discard_dest ),
				 "=&r" ( discard_src ),
				 "=&r" ( discard_data )
			       : "r" ( dest + len ), "0" ( dest ), "1" ( src )
			       : "memory" );
}
|
||||
|
||||
/**
 * Copy (possibly overlapping) memory region backwards
 *
 * @v dest		Destination region
 * @v src		Source region
 * @v len		Length
 *
 * Safe for overlapping regions only when dest lies at or above src.
 */
void arm64_memmove_backwards ( void *dest, const void *src, size_t len ) {
	/* Dummy outputs: scratch registers for the inline assembly */
	size_t discard_offset;
	unsigned long discard_data;

	/* Assume memmove() is not performance-critical, and perform a
	 * bytewise copy for simplicity.
	 *
	 * Operands: %0 = remaining length (counts down to zero),
	 * %1 = byte in flight, %2 = dest base, %3 = src base.  Copies
	 * from the highest offset downwards; the initial "cbz" skips
	 * the loop entirely for len==0.
	 */
	__asm__ __volatile__ ( "cbz %0, 2f\n\t"
			       "\n1:\n\t"
			       "sub %0, %0, #1\n\t"
			       "ldrb %w1, [%3, %0]\n\t"
			       "strb %w1, [%2, %0]\n\t"
			       "cbnz %0, 1b\n\t"
			       "\n2:\n\t"
			       : "=&r" ( discard_offset ),
				 "=&r" ( discard_data )
			       : "r" ( dest ), "r" ( src ), "0" ( len )
			       : "memory" );
}
|
||||
|
||||
/**
 * Copy (possibly overlapping) memory region
 *
 * @v dest		Destination region
 * @v src		Source region
 * @v len		Length
 *
 * Selects the copy direction at run time so that each source byte is
 * read before it can be overwritten.
 */
void arm64_memmove ( void *dest, const void *src, size_t len ) {

	/* A backwards copy is needed only when the destination starts
	 * strictly above the source; in every other case a forwards
	 * copy is safe.
	 */
	if ( dest > src ) {
		arm64_memmove_backwards ( dest, src, len );
	} else {
		arm64_memmove_forwards ( dest, src, len );
	}
}
|
|
@ -0,0 +1,106 @@
|
|||
#ifndef BITS_STRING_H
#define BITS_STRING_H

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

/** @file
 *
 * String functions
 *
 */

/* Optimised out-of-line implementations, provided by the arm64 core
 * string code.  Note that arm64_memset() deliberately takes its
 * length before its fill character (to allow efficient tail-calling
 * from the zeroing path), unlike standard memset().
 */
extern void arm64_bzero ( void *dest, size_t len );
extern void arm64_memset ( void *dest, size_t len, int character );
extern void arm64_memcpy ( void *dest, const void *src, size_t len );
extern void arm64_memmove_forwards ( void *dest, const void *src, size_t len );
extern void arm64_memmove_backwards ( void *dest, const void *src, size_t len );
extern void arm64_memmove ( void *dest, const void *src, size_t len );
||||
|
||||
/**
 * Fill memory region
 *
 * @v dest		Destination region
 * @v character	Fill character
 * @v len		Length
 * @ret dest		Destination region
 */
static inline __attribute__ (( always_inline )) void *
memset ( void *dest, int character, size_t len ) {

	if ( __builtin_constant_p ( character ) && ( character == 0 ) ) {
		/* Zeroing: for small, constant lengths let gcc emit
		 * inline "stX xzr" instructions; otherwise use the
		 * optimised variable-length zeroing code.
		 */
		if ( __builtin_constant_p ( len ) && ( len <= 64 ) ) {
			__builtin_memset ( dest, 0, len );
		} else {
			arm64_bzero ( dest, len );
		}
	} else {
		/* Not necessarily zeroing: use basic variable-length
		 * code (note arm64_memset()'s swapped parameter
		 * order).
		 */
		arm64_memset ( dest, len, character );
	}
	return dest;
}
|
||||
|
||||
/**
 * Copy memory region
 *
 * @v dest		Destination region
 * @v src		Source region
 * @v len		Length
 * @ret dest		Destination region
 */
static inline __attribute__ (( always_inline )) void *
memcpy ( void *dest, const void *src, size_t len ) {

	/* Variable or large lengths use the out-of-line copy code */
	if ( ! ( __builtin_constant_p ( len ) && ( len <= 64 ) ) ) {
		arm64_memcpy ( dest, src, len );
		return dest;
	}

	/* For small, constant lengths, let gcc generate inline
	 * "ldX"/"stX" instruction pairs.
	 */
	__builtin_memcpy ( dest, src, len );
	return dest;
}
|
||||
|
||||
/**
 * Copy (possibly overlapping) memory region
 *
 * @v dest		Destination region
 * @v src		Source region
 * @v len		Length
 * @ret dest		Destination region
 */
static inline __attribute__ (( always_inline )) void *
memmove ( void *dest, const void *src, size_t len ) {
	ssize_t displacement = ( dest - src );

	/* If the required copy direction is known at build time, then
	 * call the appropriate forwards/backwards copy directly;
	 * otherwise fall back to the ambidirectional copy.
	 */
	if ( __builtin_constant_p ( displacement ) ) {
		if ( displacement <= 0 ) {
			arm64_memmove_forwards ( dest, src, len );
		} else {
			arm64_memmove_backwards ( dest, src, len );
		}
	} else {
		arm64_memmove ( dest, src, len );
	}
	return dest;
}
|
||||
|
||||
#endif /* BITS_STRING_H */
|
Reference in New Issue