diff options
-rw-r--r-- | libc/ChangeLog | 46 | ||||
-rw-r--r-- | libc/csu/libc-tls.c | 9 | ||||
-rw-r--r-- | libc/elf/dl-lookup.c | 13 | ||||
-rw-r--r-- | libc/elf/dl-reloc.c | 5 | ||||
-rw-r--r-- | libc/elf/dl-runtime.c | 8 | ||||
-rw-r--r-- | libc/include/libc-symbols.h | 13 | ||||
-rw-r--r-- | libc/math/s_fma.c | 4 | ||||
-rw-r--r-- | libc/math/s_fmaf.c | 4 | ||||
-rw-r--r-- | libc/nptl/ChangeLog | 12 | ||||
-rw-r--r-- | libc/nptl/sysdeps/x86_64/tcb-offsets.sym | 1 | ||||
-rw-r--r-- | libc/nptl/sysdeps/x86_64/tls.h | 80 | ||||
-rw-r--r-- | libc/stdio-common/scanf15.c | 1 | ||||
-rw-r--r-- | libc/stdio-common/scanf17.c | 1 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/dl-trampoline.S | 105 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/multiarch/Versions | 5 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/multiarch/init-arch.c | 10 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/multiarch/init-arch.h | 22 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/multiarch/s_fma.c | 43 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/multiarch/s_fmaf.c | 42 | ||||
-rwxr-xr-x | libc/sysdeps/x86_64/tst-xmmymm.sh | 7 |
20 files changed, 388 insertions, 43 deletions
diff --git a/libc/ChangeLog b/libc/ChangeLog index 60b76547c..0d0120ccb 100644 --- a/libc/ChangeLog +++ b/libc/ChangeLog @@ -1,3 +1,49 @@ +2009-07-29 Ulrich Drepper <drepper@redhat.com> + + * math/s_fma.c: Don't define alias if __fma is a macro. + * math/s_fmaf.c: Likewise. + * sysdeps/x86_64/multiarch/s_fma.c: New file. + * sysdeps/x86_64/multiarch/s_fmaf.c: New file. + Partially based on a patch by H.J. Lu <hongjiu.lu@intel.com>. + + * sysdeps/x86_64/multiarch/init-arch.h (__get_cpu_features): Declare. + (HAS_POPCOUNT, HAS_SSE4_2): Add variants which work outside libc. + New macro HAS_FMA. + * sysdeps/x86_64/multiarch/init-arch.c (__get_cpu_features): New + function. + * include/libc-symbols.h (libm_ifunc): Define. + * sysdeps/x86_64/multiarch/Versions: New file. + + * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Improve CFI. + +2009-07-28 H.J. Lu <hongjiu.lu@intel.com> + + * sysdeps/x86_64/dl-trampoline.S: Properly restore AVX registers. + +2009-07-29 Ulrich Drepper <drepper@redhat.com> + + * elf/dl-runtime.c (_dl_fixup): Indicate before _dl_lookup_symbol_x + call that registers used in calling conventions need to be preserved. + * elf/dl-lookup.c (do_lookup_x): Use RTLD_*_FOREIGN_CALL macros + to preserve register content if necessary. + * sysdeps/x86_64/dl-trampoline.S (_dl_x86_64_save_sse): New function. + (_dl_x86_64_restore_sse): New function. + * sysdeps/x86_64/tst-xmmymm.sh: There is now one more function that + is allowed to modify xmm/ymm registers. + + * stdio-common/scanf15.c: Undefine _LIBC. We want to test from an + application's perspective. + * stdio-common/scanf17.c: Likewise. + +2009-07-28 Ulrich Drepper <drepper@redhat.com> + + * csu/libc-tls.c (__libc_setup_tls) [TLS_TCB_AT_TP]: Don't add TCB + size to memsz. + (init_static_tls) [TLS_TCB_AT_TP]: Add it to GL(dl_tls_static_size) + here. + * elf/dl-reloc.c (_dl_try_allocate_static_tls): Compute freebytes in + two steps to catch bugs. 
+ 2009-07-27 Ulrich Drepper <drepper@redhat.com> * sysdeps/x86_64/tst-xmmymm.sh: Refine testing. The script now diff --git a/libc/csu/libc-tls.c b/libc/csu/libc-tls.c index 0d240ccef..5a4994286 100644 --- a/libc/csu/libc-tls.c +++ b/libc/csu/libc-tls.c @@ -1,5 +1,5 @@ /* Initialization code for TLS in statically linked application. - Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc. + Copyright (C) 2002-2006, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -99,6 +99,9 @@ init_static_tls (size_t memsz, size_t align) surplus that permits dynamic loading of modules with IE-model TLS. */ GL(dl_tls_static_size) = roundup (memsz + GL(dl_tls_static_size), TLS_TCB_ALIGN); +#if TLS_TCB_AT_TP + GL(dl_tls_static_size) += TLS_TCB_SIZE; +#endif GL(dl_tls_static_used) = memsz; /* The alignment requirement for the static TLS block. */ GL(dl_tls_static_align) = align; @@ -211,9 +214,7 @@ __libc_setup_tls (size_t tcbsize, size_t tcbalign) memsz = roundup (memsz, align ?: 1); -#if TLS_TCB_AT_TP - memsz += tcbsize; -#elif TLS_DTV_AT_TP +#if TLS_DTV_AT_TP memsz += tcb_offset; #endif diff --git a/libc/elf/dl-lookup.c b/libc/elf/dl-lookup.c index 1d68d67a3..56724c9b4 100644 --- a/libc/elf/dl-lookup.c +++ b/libc/elf/dl-lookup.c @@ -380,6 +380,10 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash, if (size * 3 <= tab->n_elements * 4) { /* Expand the table. */ +#ifdef RTLD_CHECK_FOREIGN_CALL + /* This must not happen during runtime relocations. */ + assert (!RTLD_CHECK_FOREIGN_CALL); +#endif size_t newsize = _dl_higher_prime_number (size + 1); struct unique_sym *newentries = calloc (sizeof (struct unique_sym), newsize); @@ -405,6 +409,11 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash, } else { +#ifdef RTLD_CHECK_FOREIGN_CALL + /* This must not happen during runtime relocations. 
*/ + assert (!RTLD_CHECK_FOREIGN_CALL); +#endif + #define INITIAL_NUNIQUE_SYM_TABLE 31 size = INITIAL_NUNIQUE_SYM_TABLE; entries = calloc (sizeof (struct unique_sym), size); @@ -600,6 +609,10 @@ add_dependency (struct link_map *undef_map, struct link_map *map, int flags) unsigned int max = undef_map->l_reldepsmax ? undef_map->l_reldepsmax * 2 : 10; +#ifdef RTLD_PREPARE_FOREIGN_CALL + RTLD_PREPARE_FOREIGN_CALL; +#endif + newp = malloc (sizeof (*newp) + max * sizeof (struct link_map *)); if (newp == NULL) { diff --git a/libc/elf/dl-reloc.c b/libc/elf/dl-reloc.c index 07984d456..d21329816 100644 --- a/libc/elf/dl-reloc.c +++ b/libc/elf/dl-reloc.c @@ -61,7 +61,10 @@ _dl_try_allocate_static_tls (struct link_map *map) size_t n; size_t blsize; - freebytes = GL(dl_tls_static_size) - GL(dl_tls_static_used) - TLS_TCB_SIZE; + freebytes = GL(dl_tls_static_size) - GL(dl_tls_static_used); + if (freebytes < TLS_TCB_SIZE) + goto fail; + freebytes -= TLS_TCB_SIZE; blsize = map->l_tls_blocksize + map->l_tls_firstbyte_offset; if (freebytes < blsize) diff --git a/libc/elf/dl-runtime.c b/libc/elf/dl-runtime.c index 0eb7d4e3b..a52120d12 100644 --- a/libc/elf/dl-runtime.c +++ b/libc/elf/dl-runtime.c @@ -111,6 +111,10 @@ _dl_fixup ( flags |= DL_LOOKUP_GSCOPE_LOCK; } +#ifdef RTLD_ENABLE_FOREIGN_CALL + RTLD_ENABLE_FOREIGN_CALL; +#endif + result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym, l->l_scope, version, ELF_RTYPE_CLASS_PLT, flags, NULL); @@ -118,6 +122,10 @@ _dl_fixup ( if (!RTLD_SINGLE_THREAD_P) THREAD_GSCOPE_RESET_FLAG (); +#ifdef RTLD_FINALIZE_FOREIGN_CALL + RTLD_FINALIZE_FOREIGN_CALL; +#endif + /* Currently result contains the base load address (or link map) of the object that defines sym. Now add in the symbol offset. 
*/ diff --git a/libc/include/libc-symbols.h b/libc/include/libc-symbols.h index 76876ac36..7ea089089 100644 --- a/libc/include/libc-symbols.h +++ b/libc/include/libc-symbols.h @@ -1,6 +1,6 @@ /* Support macros for making weak and strong aliases for symbols, and for using symbol sets and linker warnings with GNU ld. - Copyright (C) 1995-1998, 2000-2006, 2008 Free Software Foundation, Inc. + Copyright (C) 1995-1998,2000-2006,2008,2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -866,6 +866,17 @@ for linking") } \ __asm__ (".type " #name ", %gnu_indirect_function"); +/* The body of the function is supposed to use __get_cpu_features + which will, if necessary, initialize the data first. */ +#define libm_ifunc(name, expr) \ + extern void *name##_ifunc (void) __asm__ (#name); \ + void *name##_ifunc (void) \ + { \ + __typeof (name) *res = expr; \ + return res; \ + } \ + __asm__ (".type " #name ", %gnu_indirect_function"); + #ifdef HAVE_ASM_SET_DIRECTIVE # define libc_ifunc_hidden_def1(local, name) \ __asm__ (declare_symbol_alias_1_stringify (ASM_GLOBAL_DIRECTIVE) \ diff --git a/libc/math/s_fma.c b/libc/math/s_fma.c index e5ff5a722..476d1fe44 100644 --- a/libc/math/s_fma.c +++ b/libc/math/s_fma.c @@ -1,5 +1,5 @@ /* Compute x * y + z as ternary operation. - Copyright (C) 1997, 2001 Free Software Foundation, Inc. + Copyright (C) 1997, 2001, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. @@ -25,7 +25,9 @@ __fma (double x, double y, double z) { return (x * y) + z; } +#ifndef __fma weak_alias (__fma, fma) +#endif #ifdef NO_LONG_DOUBLE strong_alias (__fma, __fmal) diff --git a/libc/math/s_fmaf.c b/libc/math/s_fmaf.c index caa7f3afe..357296d70 100644 --- a/libc/math/s_fmaf.c +++ b/libc/math/s_fmaf.c @@ -1,5 +1,5 @@ /* Compute x * y + z as ternary operation. 
- Copyright (C) 1997 Free Software Foundation, Inc. + Copyright (C) 1997, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. @@ -25,4 +25,6 @@ __fmaf (float x, float y, float z) { return (x * y) + z; } +#ifndef __fmaf weak_alias (__fmaf, fmaf) +#endif diff --git a/libc/nptl/ChangeLog b/libc/nptl/ChangeLog index 8f37da793..20031b5ae 100644 --- a/libc/nptl/ChangeLog +++ b/libc/nptl/ChangeLog @@ -1,3 +1,15 @@ +2009-07-29 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/x86_64/tls.h (TLS_TCB_ALIGN): Define explicitly to 32. + + * sysdeps/x86_64/tls.h (tcbhead_t): Add room for SSE registers the + dynamic linker might have to save. + Define RTLD_CHECK_FOREIGN_CALL, RTLD_ENABLE_FOREIGN_CALL, + RTLD_PREPARE_FOREIGN_CALL, and RTLD_FINALIZE_FOREIGN_CALL. Pretty + printing. + + * sysdeps/x86_64/tcb-offsets.sym: Add RTLD_SAVESPACE_SSE. + 2009-07-28 Ulrich Drepper <drepper@redhat.com> * pthread_mutex_lock.c [NO_INCR] (__pthread_mutex_cond_lock_adjust): diff --git a/libc/nptl/sysdeps/x86_64/tcb-offsets.sym b/libc/nptl/sysdeps/x86_64/tcb-offsets.sym index 1c70c6bde..51f35c61c 100644 --- a/libc/nptl/sysdeps/x86_64/tcb-offsets.sym +++ b/libc/nptl/sysdeps/x86_64/tcb-offsets.sym @@ -15,3 +15,4 @@ VGETCPU_CACHE_OFFSET offsetof (tcbhead_t, vgetcpu_cache) #ifndef __ASSUME_PRIVATE_FUTEX PRIVATE_FUTEX offsetof (tcbhead_t, private_futex) #endif +RTLD_SAVESPACE_SSE offsetof (tcbhead_t, rtld_savespace_sse) diff --git a/libc/nptl/sysdeps/x86_64/tls.h b/libc/nptl/sysdeps/x86_64/tls.h index ea89f3b1a..4212038ab 100644 --- a/libc/nptl/sysdeps/x86_64/tls.h +++ b/libc/nptl/sysdeps/x86_64/tls.h @@ -29,6 +29,7 @@ # include <sysdep.h> # include <kernel-features.h> # include <bits/wordsize.h> +# include <xmmintrin.h> /* Type for the dtv. 
*/ @@ -55,16 +56,23 @@ typedef struct uintptr_t stack_guard; uintptr_t pointer_guard; unsigned long int vgetcpu_cache[2]; -#ifndef __ASSUME_PRIVATE_FUTEX +# ifndef __ASSUME_PRIVATE_FUTEX int private_futex; -#else +# else int __unused1; -#endif -#if __WORDSIZE == 64 - int __pad1; -#endif +# endif +# if __WORDSIZE == 64 + int rtld_must_xmm_save; +# endif /* Reservation of some values for the TM ABI. */ void *__private_tm[5]; +# if __WORDSIZE == 64 + long int __unused2; + /* Have space for the post-AVX register size. */ + __m128 rtld_savespace_sse[8][4]; + + void *__padding[8]; +# endif } tcbhead_t; #else /* __ASSEMBLER__ */ @@ -109,7 +117,12 @@ typedef struct # define TLS_TCB_SIZE sizeof (struct pthread) /* Alignment requirements for the TCB. */ -# define TLS_TCB_ALIGN __alignof__ (struct pthread) +//# define TLS_TCB_ALIGN __alignof__ (struct pthread) +// Normally the above would be correct. But we have to store post-AVX +// vector registers in the TCB and we want the storage to be aligned. +// Unfortunately there isn't yet a type for these values and hence no +// 32-byte alignment requirement. Make this explicit, for now. +# define TLS_TCB_ALIGN 32 /* The TCB can have any size and the memory following the address the thread pointer points to is unspecified. Allocate the TCB there. */ @@ -298,7 +311,7 @@ typedef struct /* Atomic compare and exchange on TLS, returning old value. */ -#define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \ +# define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \ ({ __typeof (descr->member) __ret; \ __typeof (oldval) __old = (oldval); \ if (sizeof (descr->member) == 4) \ @@ -313,7 +326,7 @@ typedef struct /* Atomic logical and. 
*/ -#define THREAD_ATOMIC_AND(descr, member, val) \ +# define THREAD_ATOMIC_AND(descr, member, val) \ (void) ({ if (sizeof ((descr)->member) == 4) \ asm volatile (LOCK_PREFIX "andl %1, %%fs:%P0" \ :: "i" (offsetof (struct pthread, member)), \ @@ -324,7 +337,7 @@ typedef struct /* Atomic set bit. */ -#define THREAD_ATOMIC_BIT_SET(descr, member, bit) \ +# define THREAD_ATOMIC_BIT_SET(descr, member, bit) \ (void) ({ if (sizeof ((descr)->member) == 4) \ asm volatile (LOCK_PREFIX "orl %1, %%fs:%P0" \ :: "i" (offsetof (struct pthread, member)), \ @@ -334,7 +347,7 @@ typedef struct abort (); }) -#define CALL_THREAD_FCT(descr) \ +# define CALL_THREAD_FCT(descr) \ ({ void *__res; \ asm volatile ("movq %%fs:%P2, %%rdi\n\t" \ "callq *%%fs:%P1" \ @@ -355,18 +368,18 @@ typedef struct /* Set the pointer guard field in the TCB head. */ -#define THREAD_SET_POINTER_GUARD(value) \ +# define THREAD_SET_POINTER_GUARD(value) \ THREAD_SETMEM (THREAD_SELF, header.pointer_guard, value) -#define THREAD_COPY_POINTER_GUARD(descr) \ +# define THREAD_COPY_POINTER_GUARD(descr) \ ((descr)->header.pointer_guard \ = THREAD_GETMEM (THREAD_SELF, header.pointer_guard)) /* Get and set the global scope generation counter in the TCB head. */ -#define THREAD_GSCOPE_FLAG_UNUSED 0 -#define THREAD_GSCOPE_FLAG_USED 1 -#define THREAD_GSCOPE_FLAG_WAIT 2 -#define THREAD_GSCOPE_RESET_FLAG() \ +# define THREAD_GSCOPE_FLAG_UNUSED 0 +# define THREAD_GSCOPE_FLAG_USED 1 +# define THREAD_GSCOPE_FLAG_WAIT 2 +# define THREAD_GSCOPE_RESET_FLAG() \ do \ { int __res; \ asm volatile ("xchgl %0, %%fs:%P1" \ @@ -377,11 +390,40 @@ typedef struct lll_futex_wake (&THREAD_SELF->header.gscope_flag, 1, LLL_PRIVATE); \ } \ while (0) -#define THREAD_GSCOPE_SET_FLAG() \ +# define THREAD_GSCOPE_SET_FLAG() \ THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED) -#define THREAD_GSCOPE_WAIT() \ +# define THREAD_GSCOPE_WAIT() \ GL(dl_wait_lookup_done) () + +# ifdef SHARED +/* Defined in dl-trampoline.S. 
*/ +extern void _dl_x86_64_save_sse (void); +extern void _dl_x86_64_restore_sse (void); + +# define RTLD_CHECK_FOREIGN_CALL \ + (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0) + +# define RTLD_ENABLE_FOREIGN_CALL \ + THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 1) + +# define RTLD_PREPARE_FOREIGN_CALL \ + do if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save)) \ + { \ + _dl_x86_64_save_sse (); \ + THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0); \ + } \ + while (0) + +# define RTLD_FINALIZE_FOREIGN_CALL \ + do { \ + if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) == 0) \ + _dl_x86_64_restore_sse (); \ + THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0); \ + } while (0) +# endif + + #endif /* __ASSEMBLER__ */ #endif /* tls.h */ diff --git a/libc/stdio-common/scanf15.c b/libc/stdio-common/scanf15.c index c56715c48..851466b3a 100644 --- a/libc/stdio-common/scanf15.c +++ b/libc/stdio-common/scanf15.c @@ -1,5 +1,6 @@ #undef _GNU_SOURCE #define _XOPEN_SOURCE 600 +#undef _LIBC /* The following macro definitions are a hack. They word around disabling the GNU extension while still using a few internal headers. */ #define u_char unsigned char diff --git a/libc/stdio-common/scanf17.c b/libc/stdio-common/scanf17.c index ee9024f9b..4478a7022 100644 --- a/libc/stdio-common/scanf17.c +++ b/libc/stdio-common/scanf17.c @@ -1,5 +1,6 @@ #undef _GNU_SOURCE #define _XOPEN_SOURCE 600 +#undef _LIBC /* The following macro definitions are a hack. They word around disabling the GNU extension while still using a few internal headers. 
*/ #define u_char unsigned char diff --git a/libc/sysdeps/x86_64/dl-trampoline.S b/libc/sysdeps/x86_64/dl-trampoline.S index 49d239f07..20da6956f 100644 --- a/libc/sysdeps/x86_64/dl-trampoline.S +++ b/libc/sysdeps/x86_64/dl-trampoline.S @@ -61,6 +61,7 @@ _dl_runtime_resolve: cfi_startproc _dl_runtime_profile: + cfi_adjust_cfa_offset(16) # Incorporate PLT /* The La_x86_64_regs data structure pointed to by the fourth paramater must be 16-byte aligned. This must be explicitly enforced. We have the set up a dynamically @@ -68,7 +69,7 @@ _dl_runtime_profile: has a fixed size and preserves the original stack pointer. */ subq $32, %rsp # Allocate the local storage. - cfi_adjust_cfa_offset(48) # Incorporate PLT + cfi_adjust_cfa_offset(32) movq %rbx, (%rsp) cfi_rel_offset(%rbx, 0) @@ -203,49 +204,49 @@ L(no_avx1): vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0 1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1 1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2 1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3 1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4 1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5 1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6 1: vpcmpeqq (LR_SIZE + 
XMM_SIZE*7)(%rsp), %xmm7, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7 L(no_avx2): @@ -361,13 +362,13 @@ L(no_avx3): vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2 vpmovmskb %xmm2, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0 1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2 vpmovmskb %xmm2, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1 L(no_avx4): @@ -390,3 +391,85 @@ L(no_avx4): cfi_endproc .size _dl_runtime_profile, .-_dl_runtime_profile #endif + + +#ifdef SHARED + .globl _dl_x86_64_save_sse + .type _dl_x86_64_save_sse, @function + .align 16 + cfi_startproc +_dl_x86_64_save_sse: +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + jne 1f + movq %rbx, %r11 # Save rbx + movl $1, %eax + cpuid + movq %r11,%rbx # Restore rbx + movl $1, %eax + testl $(1 << 28), %ecx + jne 2f + negl %eax +2: movl %eax, L(have_avx)(%rip) + cmpl $0, %eax + +1: js L(no_avx5) + +# define YMM_SIZE 32 + vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE + vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE + vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE + vmovdqa %ymm3, %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE + vmovdqa %ymm4, %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE + vmovdqa %ymm5, %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE + vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE + vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE + ret +L(no_avx5): +# endif +# define YMM_SIZE 16 + movdqa %xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE + movdqa %xmm1, %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE + movdqa %xmm2, %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE + movdqa %xmm3, %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE + movdqa %xmm4, %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE + movdqa %xmm5, %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE + movdqa %xmm6, %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE + movdqa %xmm7, %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE + ret + cfi_endproc + .size _dl_x86_64_save_sse, .-_dl_x86_64_save_sse + + + .globl _dl_x86_64_restore_sse 
+ .type _dl_x86_64_restore_sse, @function + .align 16 + cfi_startproc +_dl_x86_64_restore_sse: +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + js L(no_avx6) + + vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0 + vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1 + vmovdqa %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE, %ymm2 + vmovdqa %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE, %ymm3 + vmovdqa %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE, %ymm4 + vmovdqa %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE, %ymm5 + vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6 + vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7 + ret +L(no_avx6): +# endif + movdqa %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0 + movdqa %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE, %xmm1 + movdqa %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE, %xmm2 + movdqa %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE, %xmm3 + movdqa %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE, %xmm4 + movdqa %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE, %xmm5 + movdqa %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE, %xmm6 + movdqa %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE, %xmm7 + ret + cfi_endproc + .size _dl_x86_64_restore_sse, .-_dl_x86_64_restore_sse +#endif diff --git a/libc/sysdeps/x86_64/multiarch/Versions b/libc/sysdeps/x86_64/multiarch/Versions new file mode 100644 index 000000000..59b185ac8 --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/Versions @@ -0,0 +1,5 @@ +libc { + GLIBC_PRIVATE { + __get_cpu_features; + } +} diff --git a/libc/sysdeps/x86_64/multiarch/init-arch.c b/libc/sysdeps/x86_64/multiarch/init-arch.c index 35fd19af0..49b421eac 100644 --- a/libc/sysdeps/x86_64/multiarch/init-arch.c +++ b/libc/sysdeps/x86_64/multiarch/init-arch.c @@ -86,3 +86,13 @@ __init_cpu_features (void) else __cpu_features.kind = arch_kind_other; } + + +const struct cpu_features * +__get_cpu_features (void) +{ + if (__cpu_features.kind == arch_kind_unknown) + __init_cpu_features (); + + return &__cpu_features; +} diff --git a/libc/sysdeps/x86_64/multiarch/init-arch.h b/libc/sysdeps/x86_64/multiarch/init-arch.h index 48a212741..0151e8b95 100644 --- 
a/libc/sysdeps/x86_64/multiarch/init-arch.h +++ b/libc/sysdeps/x86_64/multiarch/init-arch.h @@ -54,10 +54,28 @@ extern void __init_cpu_features (void) attribute_hidden; __init_cpu_features (); \ while (0) +/* Used from outside libc.so to get access to the CPU features structure. */ +extern const struct cpu_features *__get_cpu_features (void) + __attribute__ ((const)); + /* Following are the feature tests used throughout libc. */ -#define HAS_POPCOUNT \ +#ifndef NOT_IN_libc +# define HAS_POPCOUNT \ ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 23)) != 0) -#define HAS_SSE4_2 \ +# define HAS_SSE4_2 \ ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 20)) != 0) + +# define HAS_FMA \ + ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 12)) != 0) +#else +# define HAS_POPCOUNT \ + ((__get_cpu_features ()->cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 23)) != 0) + +# define HAS_SSE4_2 \ + ((__get_cpu_features ()->cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 20)) != 0) + +# define HAS_FMA \ + ((__get_cpu_features ()->cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 12)) != 0) +#endif diff --git a/libc/sysdeps/x86_64/multiarch/s_fma.c b/libc/sysdeps/x86_64/multiarch/s_fma.c new file mode 100644 index 000000000..40601e9a6 --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/s_fma.c @@ -0,0 +1,43 @@ +/* FMA version of fma. + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> +#include <math.h> +#include <init-arch.h> + +#ifdef HAVE_AVX_SUPPORT + +extern double __fma_sse2 (double x, double y, double z); + + +double +__fma_fma (double x, double y, double z) +{ + asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} + +libm_ifunc (__fma, HAS_FMA ? __fma_fma : __fma_sse2); +weak_alias (__fma, fma) + +# define __fma __fma_sse2 +#endif + +#include <math/s_fma.c> diff --git a/libc/sysdeps/x86_64/multiarch/s_fmaf.c b/libc/sysdeps/x86_64/multiarch/s_fmaf.c new file mode 100644 index 000000000..f3d37f8f4 --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/s_fmaf.c @@ -0,0 +1,42 @@ +/* FMA version of fmaf. + Copyright (C) 2009 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#include <config.h> +#include <math.h> +#include <init-arch.h> + +#ifdef HAVE_AVX_SUPPORT + +extern float __fmaf_sse2 (float x, float y, float z); + + +float +__fmaf_fma (float x, float y, float z) +{ + asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} + +libm_ifunc (__fmaf, HAS_FMA ? __fmaf_fma : __fmaf_sse2); +weak_alias (__fmaf, fmaf) + +# define __fmaf __fmaf_sse2 +#endif + +#include <math/s_fmaf.c> diff --git a/libc/sysdeps/x86_64/tst-xmmymm.sh b/libc/sysdeps/x86_64/tst-xmmymm.sh index a576e7da0..da8af7e68 100755 --- a/libc/sysdeps/x86_64/tst-xmmymm.sh +++ b/libc/sysdeps/x86_64/tst-xmmymm.sh @@ -59,10 +59,11 @@ for f in $tocheck; do objdump -d "$objpfx"../*/"$f" | awk 'BEGIN { last="" } /^[[:xdigit:]]* <[_[:alnum:]]*>:$/ { fct=substr($2, 2, length($2)-3) } /,%[xy]mm[[:digit:]]*$/ { if (last != fct) { print fct; last=fct} }' | while read fct; do - if test "$fct" != "_dl_runtime_profile"; then - echo "function $fct in $f modifies xmm/ymm" >> "$tmp" - result=1 + if test "$fct" = "_dl_runtime_profile" -o "$fct" = "_dl_x86_64_restore_sse"; then + continue; fi + echo "function $fct in $f modifies xmm/ymm" >> "$tmp" + result=1 done done |