; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,X64
; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,X86

; An atomic read-modify-write whose operand leaves memory unchanged (for
; example an atomic add of 0) does not need a locked RMW instruction on x86:
; it can be lowered to an mfence followed by a plain load.
; The transformation and its motivation are described in
; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf

; Monotonic i8 add of 0 with a used result: both targets expect a fence
; followed by a byte load.
define i8 @add8(i8* %p) {
; X64-LABEL: add8:
; X64:       # %bb.0:
; X64-NEXT:    mfence
; X64-NEXT:    movb (%rdi), %al
; X64-NEXT:    retq
;
; X86-LABEL: add8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    mfence
; X86-NEXT:    movb (%eax), %al
; X86-NEXT:    retl
  %old = atomicrmw add i8* %p, i8 0 monotonic
  ret i8 %old
}

; Acquire i16 or of 0 with a used result: fence, then a zero-extending load.
define i16 @or16(i16* %p) {
; X64-LABEL: or16:
; X64:       # %bb.0:
; X64-NEXT:    mfence
; X64-NEXT:    movzwl (%rdi), %eax
; X64-NEXT:    retq
;
; X86-LABEL: or16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    mfence
; X86-NEXT:    movzwl (%eax), %eax
; X86-NEXT:    retl
  %old = atomicrmw or i16* %p, i16 0 acquire
  ret i16 %old
}

; Release i32 xor of 0 with a used result: fence, then a 32-bit load.
define i32 @xor32(i32* %p) {
; X64-LABEL: xor32:
; X64:       # %bb.0:
; X64-NEXT:    mfence
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    retq
;
; X86-LABEL: xor32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    mfence
; X86-NEXT:    movl (%eax), %eax
; X86-NEXT:    retl
  %old = atomicrmw xor i32* %p, i32 0 release
  ret i32 %old
}

; seq_cst i64 sub of 0 with a used result: the 64-bit target expects a fence
; and a 64-bit load, while the 32-bit target expects a lock cmpxchg8b loop.
define i64 @sub64(i64* %p) {
; X64-LABEL: sub64:
; X64:       # %bb.0:
; X64-NEXT:    mfence
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    retq
;
; X86-LABEL: sub64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebx
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    pushl %esi
; X86-NEXT:    .cfi_def_cfa_offset 12
; X86-NEXT:    .cfi_offset %esi, -12
; X86-NEXT:    .cfi_offset %ebx, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl (%esi), %eax
; X86-NEXT:    movl 4(%esi), %edx
; X86-NEXT:    .p2align 4, 0x90
; X86-NEXT:  .LBB3_1: # %atomicrmw.start
; X86-NEXT:    # =>This Inner Loop Header: Depth=1
; X86-NEXT:    movl %edx, %ecx
; X86-NEXT:    movl %eax, %ebx
; X86-NEXT:    lock cmpxchg8b (%esi)
; X86-NEXT:    jne .LBB3_1
; X86-NEXT:  # %bb.2: # %atomicrmw.end
; X86-NEXT:    popl %esi
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    popl %ebx
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl
  %old = atomicrmw sub i64* %p, i64 0 seq_cst
  ret i64 %old
}

; Monotonic i128 or of 0 with a used result: both targets expect a call to
; __sync_fetch_and_or_16.
define i128 @or128(i128* %p) {
; X64-LABEL: or128:
; X64:       # %bb.0:
; X64-NEXT:    pushq %rax
; X64-NEXT:    .cfi_def_cfa_offset 16
; X64-NEXT:    xorl %esi, %esi
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    callq __sync_fetch_and_or_16
; X64-NEXT:    popq %rcx
; X64-NEXT:    .cfi_def_cfa_offset 8
; X64-NEXT:    retq
;
; X86-LABEL: or128:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    .cfi_offset %esi, -16
; X86-NEXT:    .cfi_offset %edi, -12
; X86-NEXT:    movl 8(%ebp), %esi
; X86-NEXT:    movl %esp, %eax
; X86-NEXT:    pushl $0
; X86-NEXT:    pushl $0
; X86-NEXT:    pushl $0
; X86-NEXT:    pushl $0
; X86-NEXT:    pushl 12(%ebp)
; X86-NEXT:    pushl %eax
; X86-NEXT:    calll __sync_fetch_and_or_16
; X86-NEXT:    addl $20, %esp
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl %edi, 8(%esi)
; X86-NEXT:    movl %edx, 12(%esi)
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    movl %ecx, 4(%esi)
; X86-NEXT:    movl %esi, %eax
; X86-NEXT:    leal -8(%ebp), %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    retl $4
  %old = atomicrmw or i128* %p, i128 0 monotonic
  ret i128 %old
}

; For 'and', the operand that leaves memory unchanged is all-ones (-1).
define i32 @and32(i32* %p) {
; X64-LABEL: and32:
; X64:       # %bb.0:
; X64-NEXT:    mfence
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    retq
;
; X86-LABEL: and32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    mfence
; X86-NEXT:    movl (%eax), %eax
; X86-NEXT:    retl
  %old = atomicrmw and i32* %p, i32 -1 acq_rel
  ret i32 %old
}

; The next four functions discard the result and use an ordering weaker than
; seq_cst; both targets expect only a #MEMBARRIER marker before the return.
define void @or32_nouse_monotonic(i32* %p) {
; CHECK-LABEL: or32_nouse_monotonic:
; CHECK:       # %bb.0:
; CHECK-NEXT:    #MEMBARRIER
; CHECK-NEXT:    ret{{[l|q]}}
  atomicrmw or i32* %p, i32 0 monotonic
  ret void
}

define void @or32_nouse_acquire(i32* %p) {
; CHECK-LABEL: or32_nouse_acquire:
; CHECK:       # %bb.0:
; CHECK-NEXT:    #MEMBARRIER
; CHECK-NEXT:    ret{{[l|q]}}
  atomicrmw or i32* %p, i32 0 acquire
  ret void
}

define void @or32_nouse_release(i32* %p) {
; CHECK-LABEL: or32_nouse_release:
; CHECK:       # %bb.0:
; CHECK-NEXT:    #MEMBARRIER
; CHECK-NEXT:    ret{{[l|q]}}
  atomicrmw or i32* %p, i32 0 release
  ret void
}

define void @or32_nouse_acq_rel(i32* %p) {
; CHECK-LABEL: or32_nouse_acq_rel:
; CHECK:       # %bb.0:
; CHECK-NEXT:    #MEMBARRIER
; CHECK-NEXT:    ret{{[l|q]}}
  atomicrmw or i32* %p, i32 0 acq_rel
  ret void
}

; seq_cst with the result discarded: both targets expect a locked or of 0
; against a stack slot instead of touching %p.
define void @or32_nouse_seq_cst(i32* %p) {
; X64-LABEL: or32_nouse_seq_cst:
; X64:       # %bb.0:
; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    retq
;
; X86-LABEL: or32_nouse_seq_cst:
; X86:       # %bb.0:
; X86-NEXT:    lock orl $0, (%esp)
; X86-NEXT:    retl
  atomicrmw or i32* %p, i32 0 seq_cst
  ret void
}

; TODO: The value isn't used on 32 bit, so the cmpxchg8b loop is unneeded.
define void @or64_nouse_seq_cst(i64* %p) {
; X64-LABEL: or64_nouse_seq_cst:
; X64:       # %bb.0:
; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    retq
;
; X86-LABEL: or64_nouse_seq_cst:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebx
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    pushl %esi
; X86-NEXT:    .cfi_def_cfa_offset 12
; X86-NEXT:    .cfi_offset %esi, -12
; X86-NEXT:    .cfi_offset %ebx, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl (%esi), %eax
; X86-NEXT:    movl 4(%esi), %edx
; X86-NEXT:    .p2align 4, 0x90
; X86-NEXT:  .LBB11_1: # %atomicrmw.start
; X86-NEXT:    # =>This Inner Loop Header: Depth=1
; X86-NEXT:    movl %edx, %ecx
; X86-NEXT:    movl %eax, %ebx
; X86-NEXT:    lock cmpxchg8b (%esi)
; X86-NEXT:    jne .LBB11_1
; X86-NEXT:  # %bb.2: # %atomicrmw.end
; X86-NEXT:    popl %esi
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    popl %ebx
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl
  atomicrmw or i64* %p, i64 0 seq_cst
  ret void
}

; TODO: The result is unused, so this does not need to be lowered to a
; __sync_fetch_and_or_16 libcall.
define void @or128_nouse_seq_cst(i128* %p) {
; X64-LABEL: or128_nouse_seq_cst:
; X64:       # %bb.0:
; X64-NEXT:    pushq %rax
; X64-NEXT:    .cfi_def_cfa_offset 16
; X64-NEXT:    xorl %esi, %esi
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    callq __sync_fetch_and_or_16
; X64-NEXT:    popq %rax
; X64-NEXT:    .cfi_def_cfa_offset 8
; X64-NEXT:    retq
;
; X86-LABEL: or128_nouse_seq_cst:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    movl %esp, %eax
; X86-NEXT:    pushl $0
; X86-NEXT:    pushl $0
; X86-NEXT:    pushl $0
; X86-NEXT:    pushl $0
; X86-NEXT:    pushl 8(%ebp)
; X86-NEXT:    pushl %eax
; X86-NEXT:    calll __sync_fetch_and_or_16
; X86-NEXT:    addl $20, %esp
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    retl
  atomicrmw or i128* %p, i128 0 seq_cst
  ret void
}

; Narrower seq_cst cases with the result discarded: the expected output is
; the same locked or of 0 against a stack slot as for i32.
define void @or16_nouse_seq_cst(i16* %p) {
; X64-LABEL: or16_nouse_seq_cst:
; X64:       # %bb.0:
; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    retq
;
; X86-LABEL: or16_nouse_seq_cst:
; X86:       # %bb.0:
; X86-NEXT:    lock orl $0, (%esp)
; X86-NEXT:    retl
  atomicrmw or i16* %p, i16 0 seq_cst
  ret void
}

define void @or8_nouse_seq_cst(i8* %p) {
; X64-LABEL: or8_nouse_seq_cst:
; X64:       # %bb.0:
; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    retq
;
; X86-LABEL: or8_nouse_seq_cst:
; X86:       # %bb.0:
; X86-NEXT:    lock orl $0, (%esp)
; X86-NEXT:    retl
  atomicrmw or i8* %p, i8 0 seq_cst
  ret void
}