aboutsummaryrefslogtreecommitdiff
path: root/arch/powerpc/lib
diff options
context:
space:
mode:
authorAnton Blanchard <anton@samba.org>2010-08-12 16:28:09 +0000
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2010-11-29 15:48:17 +1100
commit64ff31287693c1f325cb9cb049569c1611438ef1 (patch)
tree3c159d2ca6f967fca13bae17cff19f92e0b3896c /arch/powerpc/lib
parent72083646528d4887b920deb71b37e09bc7d227bb (diff)
powerpc: Add support for popcnt instructions
POWER5 added popcntb, and POWER7 added popcntw and popcntd. As a first step this patch does all the work out of line, but it would be nice to implement them as inlines with an out of line fallback. The performance issue with hweight was noticed when disabling SMT on a large (192 thread) POWER7 box. The patch improves that testcase by about 8%. Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r--arch/powerpc/lib/Makefile2
-rw-r--r--arch/powerpc/lib/hweight_64.S110
2 files changed, 111 insertions, 1 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 889f2bc106d..166a6a0ad54 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_HAS_IOMEM) += devres.o
obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \
memcpy_64.o usercopy_64.o mem_64.o string.o \
- checksum_wrappers_64.o
+ checksum_wrappers_64.o hweight_64.o
obj-$(CONFIG_XMON) += sstep.o ldstfp.o
obj-$(CONFIG_KPROBES) += sstep.o ldstfp.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += sstep.o ldstfp.o
diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S
new file mode 100644
index 00000000000..ee2320bb5dd
--- /dev/null
+++ b/arch/powerpc/lib/hweight_64.S
@@ -0,0 +1,110 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2010
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+
+/* Note: This code relies on -mminimal-toc */
+
+_GLOBAL(__arch_hweight8)
+BEGIN_FTR_SECTION
+ b .__sw_hweight8
+ nop
+ nop
+FTR_SECTION_ELSE
+ popcntb r3,r3
+ clrldi r3,r3,64-8
+ blr
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
+
+_GLOBAL(__arch_hweight16)
+BEGIN_FTR_SECTION
+ b .__sw_hweight16
+ nop
+ nop
+ nop
+ nop
+FTR_SECTION_ELSE
+ BEGIN_FTR_SECTION_NESTED(50)
+ popcntb r3,r3
+ srdi r4,r3,8
+ add r3,r4,r3
+ clrldi r3,r3,64-8
+ blr
+ FTR_SECTION_ELSE_NESTED(50)
+ clrlwi r3,r3,16
+ popcntw r3,r3
+ clrldi r3,r3,64-8
+ blr
+ ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 50)
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
+
+_GLOBAL(__arch_hweight32)
+BEGIN_FTR_SECTION
+ b .__sw_hweight32
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+FTR_SECTION_ELSE
+ BEGIN_FTR_SECTION_NESTED(51)
+ popcntb r3,r3
+ srdi r4,r3,16
+ add r3,r4,r3
+ srdi r4,r3,8
+ add r3,r4,r3
+ clrldi r3,r3,64-8
+ blr
+ FTR_SECTION_ELSE_NESTED(51)
+ popcntw r3,r3
+ clrldi r3,r3,64-8
+ blr
+ ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 51)
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
+
+_GLOBAL(__arch_hweight64)
+BEGIN_FTR_SECTION
+ b .__sw_hweight64
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+FTR_SECTION_ELSE
+ BEGIN_FTR_SECTION_NESTED(52)
+ popcntb r3,r3
+ srdi r4,r3,32
+ add r3,r4,r3
+ srdi r4,r3,16
+ add r3,r4,r3
+ srdi r4,r3,8
+ add r3,r4,r3
+ clrldi r3,r3,64-8
+ blr
+ FTR_SECTION_ELSE_NESTED(52)
+ popcntd r3,r3
+ clrldi r3,r3,64-8
+ blr
+ ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 52)
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)