diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-05-19 17:36:08 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-05-19 17:36:08 -0700 |
commit | df48d8716eab9608fe93924e4ae06ff110e8674f (patch) | |
tree | 0fe10733a414b3651e1dae29518b7960a4da0aa4 /arch/x86/lib/memcpy_64.S | |
parent | acd30250d7d0f495685d1c7c6184636a22fcdf7f (diff) | |
parent | 29510ec3b626c86de9707bb8904ff940d430289b (diff) |
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (107 commits)
perf stat: Add more cache-miss percentage printouts
perf stat: Add -d -d and -d -d -d options to show more CPU events
ftrace/kbuild: Add recordmcount files to force full build
ftrace: Add self-tests for multiple function trace users
ftrace: Modify ftrace_set_filter/notrace to take ops
ftrace: Allow dynamically allocated function tracers
ftrace: Implement separate user function filtering
ftrace: Free hash with call_rcu_sched()
ftrace: Have global_ops store the functions that are to be traced
ftrace: Add ops parameter to ftrace_startup/shutdown functions
ftrace: Add enabled_functions file
ftrace: Use counters to enable functions to trace
ftrace: Separate hash allocation and assignment
ftrace: Create a global_ops to hold the filter and notrace hashes
ftrace: Use hash instead for FTRACE_FL_FILTER
ftrace: Replace FTRACE_FL_NOTRACE flag with a hash of ignored functions
perf bench, x86: Add alternatives-asm.h wrapper
x86, 64-bit: Fix copy_[to/from]_user() checks for the userspace address limit
x86, mem: memset_64.S: Optimize memset by enhanced REP MOVSB/STOSB
x86, mem: memmove_64.S: Optimize memmove by enhanced REP MOVSB/STOSB
...
Diffstat (limited to 'arch/x86/lib/memcpy_64.S')
-rw-r--r-- | arch/x86/lib/memcpy_64.S | 45 |
1 files changed, 32 insertions, 13 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 75ef61e35e3..daab21dae2d 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -4,6 +4,7 @@ #include <asm/cpufeature.h> #include <asm/dwarf2.h> +#include <asm/alternative-asm.h> /* * memcpy - Copy a memory block. @@ -37,6 +38,23 @@ .Lmemcpy_e: .previous +/* + * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than + * memcpy_c. Use memcpy_c_e when possible. + * + * This gets patched over the unrolled variant (below) via the + * alternative instructions framework: + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemcpy_c_e: + movq %rdi, %rax + + movl %edx, %ecx + rep movsb + ret +.Lmemcpy_e_e: + .previous + ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC @@ -171,21 +189,22 @@ ENDPROC(memcpy) ENDPROC(__memcpy) /* - * Some CPUs run faster using the string copy instructions. - * It is also a lot simpler. Use this when possible: - */ - - .section .altinstructions, "a" - .align 8 - .quad memcpy - .quad .Lmemcpy_c - .word X86_FEATURE_REP_GOOD - - /* + * Some CPUs are adding enhanced REP MOVSB/STOSB feature + * If the feature is supported, memcpy_c_e() is the first choice. + * If enhanced rep movsb copy is not available, use fast string copy + * memcpy_c() when possible. This is faster and code is simpler than + * original memcpy(). + * Otherwise, original memcpy() is used. + * In .altinstructions section, ERMS feature is placed after REG_GOOD + * feature to implement the right patch order. + * * Replace only beginning, memcpy is used to apply alternatives, * so it is silly to overwrite itself with nops - reboot is the * only outcome... */ - .byte .Lmemcpy_e - .Lmemcpy_c - .byte .Lmemcpy_e - .Lmemcpy_c + .section .altinstructions, "a" + altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ + .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c + altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ + .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e .previous |