aboutsummaryrefslogtreecommitdiff
path: root/simd
diff options
context:
space:
mode:
authorTom Gall <tom.gall@linaro.org>2011-08-29 09:53:17 -0500
committerTom Gall <tom.gall@linaro.org>2011-08-29 09:53:17 -0500
commite15681f491e8dc2762ad449980b0a3eafbddaca8 (patch)
tree9517c975d22fc1618c102743ba429cec1d64ef9f /simd
parentc14f6455230cbff4948fde981eed116298f5abb6 (diff)
parent4a72099711359606b1fc10c1744057a6c568d5d8 (diff)
clear out gorp
Diffstat (limited to 'simd')
-rwxr-xr-xsimd/CMakeLists.txt20
-rw-r--r--simd/Makefile.am17
-rw-r--r--simd/jcclrmmx.asm2
-rw-r--r--simd/jcclrss2-64.asm2
-rw-r--r--simd/jcclrss2.asm2
-rw-r--r--simd/jccolmmx.asm3
-rw-r--r--simd/jccolss2-64.asm3
-rw-r--r--simd/jccolss2.asm3
-rw-r--r--simd/jdclrmmx.asm2
-rw-r--r--simd/jdclrss2-64.asm2
-rw-r--r--simd/jdclrss2.asm2
-rw-r--r--simd/jdcolmmx.asm3
-rw-r--r--simd/jdcolss2-64.asm3
-rw-r--r--simd/jdcolss2.asm3
-rw-r--r--simd/jdmermmx.asm3
-rw-r--r--simd/jdmerss2-64.asm3
-rw-r--r--simd/jdmerss2.asm3
-rw-r--r--simd/jdmrgmmx.asm2
-rw-r--r--simd/jdmrgss2-64.asm2
-rw-r--r--simd/jdmrgss2.asm2
-rw-r--r--simd/jsimd.h129
-rw-r--r--simd/jsimd_arm.c30
-rw-r--r--simd/jsimd_arm_neon.S205
-rw-r--r--simd/jsimd_i386.c73
-rw-r--r--simd/jsimd_x86_64.c54
-rw-r--r--simd/jsimdext.inc14
26 files changed, 546 insertions, 41 deletions
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 263579d..397a9f3 100755
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -18,16 +18,16 @@ if(CMAKE_BUILD_TYPE STREQUAL "Debug"
endif()
if(SIMD_X86_64)
- set(SIMD_BASENAMES jfsseflt-64 jccolss2-64 jdcolss2-64 jcsamss2-64
- jdsamss2-64 jdmerss2-64 jcqnts2i-64 jfss2fst-64 jfss2int-64 jiss2red-64
- jiss2int-64 jiss2fst-64 jcqnts2f-64 jiss2flt-64)
+ set(SIMD_BASENAMES jfsseflt-64 jccolss2-64 jdcolss2-64 jcgrass2-64
+ jcsamss2-64 jdsamss2-64 jdmerss2-64 jcqnts2i-64 jfss2fst-64 jfss2int-64
+ jiss2red-64 jiss2int-64 jiss2fst-64 jcqnts2f-64 jiss2flt-64)
message(STATUS "Building x86_64 SIMD extensions")
else()
- set(SIMD_BASENAMES jsimdcpu jccolmmx jdcolmmx jcsammmx jdsammmx jdmermmx
- jcqntmmx jfmmxfst jfmmxint jimmxred jimmxint jimmxfst jcqnt3dn jf3dnflt
- ji3dnflt jcqntsse jfsseflt jisseflt jccolss2 jdcolss2 jcsamss2 jdsamss2
- jdmerss2 jcqnts2i jfss2fst jfss2int jiss2red jiss2int jiss2fst jcqnts2f
- jiss2flt)
+ set(SIMD_BASENAMES jsimdcpu jccolmmx jcgrammx jdcolmmx jcsammmx jdsammmx
+ jdmermmx jcqntmmx jfmmxfst jfmmxint jimmxred jimmxint jimmxfst jcqnt3dn
+ jf3dnflt ji3dnflt jcqntsse jfsseflt jisseflt jccolss2 jcgrass2 jdcolss2
+ jcsamss2 jdsamss2 jdmerss2 jcqnts2i jfss2fst jfss2int jiss2red jiss2int
+ jiss2fst jcqnts2f jiss2flt)
message(STATUS "Building i386 SIMD extensions")
endif()
@@ -48,6 +48,10 @@ foreach(file ${SIMD_BASENAMES})
set(DEPFILE ${SIMD_SRC})
string(REGEX REPLACE "mer" "mrg" DEPFILE ${DEPFILE})
endif()
+ if(${file} MATCHES gra)
+ set(DEPFILE ${SIMD_SRC})
+ string(REGEX REPLACE "gra" "gry" DEPFILE ${DEPFILE})
+ endif()
set(SIMD_OBJ ${OBJDIR}/${file}.obj)
add_custom_command(OUTPUT ${SIMD_OBJ} DEPENDS ${SIMD_SRC} ${DEPFILE} *.inc
COMMAND ${NASM} ${NAFLAGS} ${SIMD_SRC} -o${SIMD_OBJ})
diff --git a/simd/Makefile.am b/simd/Makefile.am
index 81c23af..fbba807 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -4,14 +4,14 @@ BUILT_SOURCES = jsimdcfg.inc
EXTRA_DIST = nasm_lt.sh jcclrmmx.asm jcclrss2.asm jdclrmmx.asm jdclrss2.asm \
jdmrgmmx.asm jdmrgss2.asm jcclrss2-64.asm jdclrss2-64.asm \
- jdmrgss2-64.asm CMakeLists.txt
+ jdmrgss2-64.asm jcgryss2-64.asm jcgrymmx.asm jcgryss2.asm CMakeLists.txt
if SIMD_X86_64
libsimd_la_SOURCES = jsimd_x86_64.c \
jsimd.h jsimdcfg.inc.h \
jsimdext.inc jcolsamp.inc jdct.inc \
- jfsseflt-64.asm \
+ jfsseflt-64.asm jcgrass2-64.asm \
jccolss2-64.asm jdcolss2-64.asm \
jcsamss2-64.asm jdsamss2-64.asm jdmerss2-64.asm \
jcqnts2i-64.asm jfss2fst-64.asm jfss2int-64.asm \
@@ -20,6 +20,7 @@ libsimd_la_SOURCES = jsimd_x86_64.c \
jccolss2-64.lo: jcclrss2-64.asm
jdcolss2-64.lo: jdclrss2-64.asm
+jcgrass2-64.lo: jcgryss2-64.asm
jdmerss2-64.lo: jdmrgss2-64.asm
endif
@@ -29,26 +30,34 @@ libsimd_la_SOURCES = jsimd_i386.c \
jsimd.h jsimdcfg.inc.h \
jsimdext.inc jcolsamp.inc jdct.inc \
jsimdcpu.asm \
- jccolmmx.asm jdcolmmx.asm \
+ jccolmmx.asm jdcolmmx.asm jcgrammx.asm \
jcsammmx.asm jdsammmx.asm jdmermmx.asm \
jcqntmmx.asm jfmmxfst.asm jfmmxint.asm \
jimmxred.asm jimmxint.asm jimmxfst.asm \
jcqnt3dn.asm jf3dnflt.asm ji3dnflt.asm \
jcqntsse.asm jfsseflt.asm jisseflt.asm \
- jccolss2.asm jdcolss2.asm \
+ jccolss2.asm jdcolss2.asm jcgrass2.asm \
jcsamss2.asm jdsamss2.asm jdmerss2.asm \
jcqnts2i.asm jfss2fst.asm jfss2int.asm \
jiss2red.asm jiss2int.asm jiss2fst.asm \
jcqnts2f.asm jiss2flt.asm
jccolmmx.lo: jcclrmmx.asm
+jcgrammx.lo: jcgrymmx.asm
jccolss2.lo: jcclrss2.asm
+jcgrass2.lo: jcgryss2.asm
jdcolmmx.lo: jdclrmmx.asm
jdcolss2.lo: jdclrss2.asm
jdmermmx.lo: jdmrgmmx.asm
jdmerss2.lo: jdmrgss2.asm
endif
+if SIMD_ARM
+
+libsimd_la_SOURCES = jsimd_arm.c jsimd_arm_neon.S
+
+endif
+
AM_CPPFLAGS = -I$(top_srcdir)
.asm.lo:
diff --git a/simd/jcclrmmx.asm b/simd/jcclrmmx.asm
index b6b8912..e095253 100644
--- a/simd/jcclrmmx.asm
+++ b/simd/jcclrmmx.asm
@@ -19,8 +19,6 @@
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
;
; Convert some rows of samples to the output colorspace.
;
diff --git a/simd/jcclrss2-64.asm b/simd/jcclrss2-64.asm
index 8ca47aa..f5d6bed 100644
--- a/simd/jcclrss2-64.asm
+++ b/simd/jcclrss2-64.asm
@@ -17,8 +17,6 @@
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
;
; Convert some rows of samples to the output colorspace.
;
diff --git a/simd/jcclrss2.asm b/simd/jcclrss2.asm
index 8def718..517b705 100644
--- a/simd/jcclrss2.asm
+++ b/simd/jcclrss2.asm
@@ -16,8 +16,6 @@
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
;
; Convert some rows of samples to the output colorspace.
;
diff --git a/simd/jccolmmx.asm b/simd/jccolmmx.asm
index 5e7f3be..d0f0d79 100644
--- a/simd/jccolmmx.asm
+++ b/simd/jccolmmx.asm
@@ -51,6 +51,9 @@ PD_ONEHALF times 2 dd (1 << (SCALEBITS-1))
alignz 16
; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
%include "jcclrmmx.asm"
%undef RGB_RED
diff --git a/simd/jccolss2-64.asm b/simd/jccolss2-64.asm
index 64ee0ba..18de456 100644
--- a/simd/jccolss2-64.asm
+++ b/simd/jccolss2-64.asm
@@ -48,6 +48,9 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
alignz 16
; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
%include "jcclrss2-64.asm"
%undef RGB_RED
diff --git a/simd/jccolss2.asm b/simd/jccolss2.asm
index 8d1f734..7acb59c 100644
--- a/simd/jccolss2.asm
+++ b/simd/jccolss2.asm
@@ -48,6 +48,9 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
alignz 16
; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
%include "jcclrss2.asm"
%undef RGB_RED
diff --git a/simd/jdclrmmx.asm b/simd/jdclrmmx.asm
index 79772e0..1c255e8 100644
--- a/simd/jdclrmmx.asm
+++ b/simd/jdclrmmx.asm
@@ -19,8 +19,6 @@
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
;
; Convert some rows of samples to the output colorspace.
;
diff --git a/simd/jdclrss2-64.asm b/simd/jdclrss2-64.asm
index 4282bd2..fdb33a3 100644
--- a/simd/jdclrss2-64.asm
+++ b/simd/jdclrss2-64.asm
@@ -20,8 +20,6 @@
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
;
; Convert some rows of samples to the output colorspace.
;
diff --git a/simd/jdclrss2.asm b/simd/jdclrss2.asm
index 865fa82..3059d7d 100644
--- a/simd/jdclrss2.asm
+++ b/simd/jdclrss2.asm
@@ -19,8 +19,6 @@
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
;
; Convert some rows of samples to the output colorspace.
;
diff --git a/simd/jdcolmmx.asm b/simd/jdcolmmx.asm
index 58775e8..0834bab 100644
--- a/simd/jdcolmmx.asm
+++ b/simd/jdcolmmx.asm
@@ -48,6 +48,9 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1)
alignz 16
; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
%include "jdclrmmx.asm"
%undef RGB_RED
diff --git a/simd/jdcolss2-64.asm b/simd/jdcolss2-64.asm
index 2e97d59..d14a28a 100644
--- a/simd/jdcolss2-64.asm
+++ b/simd/jdcolss2-64.asm
@@ -48,6 +48,9 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
alignz 16
; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
%include "jdclrss2-64.asm"
%undef RGB_RED
diff --git a/simd/jdcolss2.asm b/simd/jdcolss2.asm
index 7ae985d..cab4dd0 100644
--- a/simd/jdcolss2.asm
+++ b/simd/jdcolss2.asm
@@ -48,6 +48,9 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
alignz 16
; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
%include "jdclrss2.asm"
%undef RGB_RED
diff --git a/simd/jdmermmx.asm b/simd/jdmermmx.asm
index fd587fb..75baaa5 100644
--- a/simd/jdmermmx.asm
+++ b/simd/jdmermmx.asm
@@ -48,6 +48,9 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1)
alignz 16
; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
%include "jdmrgmmx.asm"
%undef RGB_RED
diff --git a/simd/jdmerss2-64.asm b/simd/jdmerss2-64.asm
index 1f0b10f..a1fe963 100644
--- a/simd/jdmerss2-64.asm
+++ b/simd/jdmerss2-64.asm
@@ -48,6 +48,9 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
alignz 16
; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
%include "jdmrgss2-64.asm"
%undef RGB_RED
diff --git a/simd/jdmerss2.asm b/simd/jdmerss2.asm
index 2294e0d..21881b4 100644
--- a/simd/jdmerss2.asm
+++ b/simd/jdmerss2.asm
@@ -48,6 +48,9 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
alignz 16
; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
%include "jdmrgss2.asm"
%undef RGB_RED
diff --git a/simd/jdmrgmmx.asm b/simd/jdmrgmmx.asm
index b5777a3..d0800a7 100644
--- a/simd/jdmrgmmx.asm
+++ b/simd/jdmrgmmx.asm
@@ -19,8 +19,6 @@
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
diff --git a/simd/jdmrgss2-64.asm b/simd/jdmrgss2-64.asm
index 121bb82..0c2503f 100644
--- a/simd/jdmrgss2-64.asm
+++ b/simd/jdmrgss2-64.asm
@@ -20,8 +20,6 @@
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm
index 99b7eb9..368ac3c 100644
--- a/simd/jdmrgss2.asm
+++ b/simd/jdmrgss2.asm
@@ -19,8 +19,6 @@
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 89ac1b7..39a0867 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -2,6 +2,7 @@
* simd/jsimd.h
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2011 D. R. Commander
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -11,11 +12,12 @@
/* Bitmask for supported acceleration methods */
-#define JSIMD_NONE 0x00
-#define JSIMD_MMX 0x01
-#define JSIMD_3DNOW 0x02
-#define JSIMD_SSE 0x04
-#define JSIMD_SSE2 0x08
+#define JSIMD_NONE 0x00
+#define JSIMD_MMX 0x01
+#define JSIMD_3DNOW 0x02
+#define JSIMD_SSE 0x04
+#define JSIMD_SSE2 0x08
+#define JSIMD_ARM_NEON 0x10
/* Short forms of external names for systems with brain-damaged linkers. */
@@ -28,6 +30,13 @@
#define jsimd_extbgrx_ycc_convert_mmx jSEXTBGRXYCCM
#define jsimd_extxbgr_ycc_convert_mmx jSEXTXBGRYCCM
#define jsimd_extxrgb_ycc_convert_mmx jSEXTXRGBYCCM
+#define jsimd_rgb_gray_convert_mmx jSRGBGRYM
+#define jsimd_extrgb_gray_convert_mmx jSEXTRGBGRYM
+#define jsimd_extrgbx_gray_convert_mmx jSEXTRGBXGRYM
+#define jsimd_extbgr_gray_convert_mmx jSEXTBGRGRYM
+#define jsimd_extbgrx_gray_convert_mmx jSEXTBGRXGRYM
+#define jsimd_extxbgr_gray_convert_mmx jSEXTXBGRGRYM
+#define jsimd_extxrgb_gray_convert_mmx jSEXTXRGBGRYM
#define jsimd_ycc_rgb_convert_mmx jSYCCRGBM
#define jsimd_ycc_extrgb_convert_mmx jSYCCEXTRGBM
#define jsimd_ycc_extrgbx_convert_mmx jSYCCEXTRGBXM
@@ -43,6 +52,14 @@
#define jsimd_extbgrx_ycc_convert_sse2 jSEXTBGRXYCCS2
#define jsimd_extxbgr_ycc_convert_sse2 jSEXTXBGRYCCS2
#define jsimd_extxrgb_ycc_convert_sse2 jSEXTXRGBYCCS2
+#define jconst_rgb_gray_convert_sse2 jSCRGBGRYS2
+#define jsimd_rgb_gray_convert_sse2 jSRGBGRYS2
+#define jsimd_extrgb_gray_convert_sse2 jSEXTRGBGRYS2
+#define jsimd_extrgbx_gray_convert_sse2 jSEXTRGBXGRYS2
+#define jsimd_extbgr_gray_convert_sse2 jSEXTBGRGRYS2
+#define jsimd_extbgrx_gray_convert_sse2 jSEXTBGRXGRYS2
+#define jsimd_extxbgr_gray_convert_sse2 jSEXTXBGRGRYS2
+#define jsimd_extxrgb_gray_convert_sse2 jSEXTXRGBGRYS2
#define jconst_ycc_rgb_convert_sse2 jSCYCCRGBS2
#define jsimd_ycc_rgb_convert_sse2 jSYCCRGBS2
#define jsimd_ycc_extrgb_convert_sse2 jSYCCEXTRGBS2
@@ -163,6 +180,35 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_rgb_gray_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_gray_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_gray_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_gray_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_gray_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_gray_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_gray_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+
EXTERN(void) jsimd_ycc_rgb_convert_mmx
JPP((JDIMENSION out_width,
JSAMPIMAGE input_buf, JDIMENSION input_row,
@@ -222,6 +268,36 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
+extern const int jconst_rgb_gray_convert_sse2[];
+EXTERN(void) jsimd_rgb_gray_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_gray_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_gray_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_gray_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_gray_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_gray_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_gray_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+
extern const int jconst_ycc_rgb_convert_sse2[];
EXTERN(void) jsimd_ycc_rgb_convert_sse2
JPP((JDIMENSION out_width,
@@ -252,6 +328,35 @@ EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_rgb_convert_neon
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_neon
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_neon
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_neon
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_neon
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_neon
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_neon
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+
/* SIMD Downsample */
EXTERN(void) jsimd_h2v2_downsample_mmx
JPP((JDIMENSION image_width, int max_v_samp_factor,
@@ -464,6 +569,15 @@ EXTERN(void) jsimd_idct_4x4_sse2 JPP((void * dct_table,
JSAMPARRAY output_buf,
JDIMENSION output_col));
+EXTERN(void) jsimd_idct_2x2_neon JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4_neon JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+
/* SIMD Inverse DCT */
EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table,
JCOEFPTR coef_block,
@@ -485,6 +599,11 @@ EXTERN(void) jsimd_idct_ifast_sse2 JPP((void * dct_table,
JSAMPARRAY output_buf,
JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast_neon JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+
EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
JCOEFPTR coef_block,
JSAMPARRAY output_buf,
diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c
index a9d920c..9ed49fd 100644
--- a/simd/jsimd_arm.c
+++ b/simd/jsimd_arm.c
@@ -136,6 +136,7 @@ jsimd_can_rgb_ycc (void)
{
init_simd();
+<<<<<<< HEAD
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
@@ -147,6 +148,8 @@ jsimd_can_rgb_ycc (void)
if (simd_support & JSIMD_ARM_NEON)
return 1;
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
return 0;
}
@@ -181,6 +184,7 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows)
{
+<<<<<<< HEAD
void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
switch(cinfo->in_color_space)
@@ -211,6 +215,8 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
if (simd_support & JSIMD_ARM_NEON)
neonfct(cinfo->image_width, input_buf,
output_buf, output_row, num_rows);
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
}
GLOBAL(void)
@@ -386,6 +392,7 @@ jsimd_can_convsamp (void)
{
init_simd();
+<<<<<<< HEAD
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
@@ -399,6 +406,8 @@ jsimd_can_convsamp (void)
if (simd_support & JSIMD_ARM_NEON)
return 1;
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
return 0;
}
@@ -414,8 +423,11 @@ GLOBAL(void)
jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
DCTELEM * workspace)
{
+<<<<<<< HEAD
if (simd_support & JSIMD_ARM_NEON)
jsimd_convsamp_neon(sample_data, start_col, workspace);
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
}
GLOBAL(void)
@@ -437,6 +449,7 @@ jsimd_can_fdct_ifast (void)
{
init_simd();
+<<<<<<< HEAD
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
@@ -446,6 +459,8 @@ jsimd_can_fdct_ifast (void)
if (simd_support & JSIMD_ARM_NEON)
return 1;
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
return 0;
}
@@ -465,8 +480,11 @@ jsimd_fdct_islow (DCTELEM * data)
GLOBAL(void)
jsimd_fdct_ifast (DCTELEM * data)
{
+<<<<<<< HEAD
if (simd_support & JSIMD_ARM_NEON)
jsimd_fdct_ifast_neon(data);
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
}
GLOBAL(void)
@@ -479,6 +497,7 @@ jsimd_can_quantize (void)
{
init_simd();
+<<<<<<< HEAD
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
@@ -490,6 +509,8 @@ jsimd_can_quantize (void)
if (simd_support & JSIMD_ARM_NEON)
return 1;
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
return 0;
}
@@ -505,8 +526,11 @@ GLOBAL(void)
jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
DCTELEM * workspace)
{
+<<<<<<< HEAD
if (simd_support & JSIMD_ARM_NEON)
jsimd_quantize_neon(coef_block, divisors, workspace);
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
}
GLOBAL(void)
@@ -584,6 +608,7 @@ jsimd_can_idct_islow (void)
{
init_simd();
+<<<<<<< HEAD
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
@@ -599,6 +624,8 @@ jsimd_can_idct_islow (void)
if (simd_support & JSIMD_ARM_NEON)
return 1;
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
return 0;
}
@@ -640,8 +667,11 @@ jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
+<<<<<<< HEAD
if ((simd_support & JSIMD_ARM_NEON))
jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, output_col);
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
}
GLOBAL(void)
diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S
index 9ef6efc..3f1cf9e 100644
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -62,6 +62,7 @@ _\fname:
vtrn.32 \x1, \x3
.endm
+<<<<<<< HEAD
#define CENTERJSAMPLE 128
/*****************************************************************************/
@@ -536,6 +537,8 @@ asm_function jsimd_idct_islow_neon
.unreq ROW7R
.endfunc
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
/*****************************************************************************/
/*
@@ -543,6 +546,7 @@ asm_function jsimd_idct_islow_neon
*
* This function contains a fast, not so accurate integer implementation of
* the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+<<<<<<< HEAD
* and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
* function from jidctfst.c
*
@@ -552,6 +556,12 @@ asm_function jsimd_idct_islow_neon
* like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
* which introduces an extra addition. Overall, there are 6 extra additions
* per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
+=======
+ * and produces exactly the same output as IJG's original 'jpeg_idct_fast'
+ * function from jidctfst.c
+ *
+ * TODO: a bit better instructions scheduling is needed.
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
*/
#define XFIX_1_082392200 d0[0]
@@ -566,12 +576,70 @@ jsimd_idct_ifast_neon_consts:
.short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
.short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
+<<<<<<< HEAD
+=======
+/* 1-D IDCT helper macro */
+
+.macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \
+ t10, t11, t12, t13, t14
+
+ vsub.s16 \t10, \x0, \x4
+ vadd.s16 \x4, \x0, \x4
+ vswp.s16 \t10, \x0
+ vsub.s16 \t11, \x2, \x6
+ vadd.s16 \x6, \x2, \x6
+ vswp.s16 \t11, \x2
+ vsub.s16 \t10, \x3, \x5
+ vadd.s16 \x5, \x3, \x5
+ vswp.s16 \t10, \x3
+ vsub.s16 \t11, \x1, \x7
+ vadd.s16 \x7, \x1, \x7
+ vswp.s16 \t11, \x1
+
+ vqdmulh.s16 \t13, \x2, d0[1]
+ vadd.s16 \t12, \x3, \x3
+ vadd.s16 \x2, \x2, \t13
+ vqdmulh.s16 \t13, \x3, d0[3]
+ vsub.s16 \t10, \x1, \x3
+ vadd.s16 \t12, \t12, \t13
+ vqdmulh.s16 \t13, \t10, d0[2]
+ vsub.s16 \t11, \x7, \x5
+ vadd.s16 \t10, \t10, \t13
+ vqdmulh.s16 \t13, \t11, d0[1]
+ vadd.s16 \t11, \t11, \t13
+
+ vqdmulh.s16 \t13, \x1, d0[0]
+ vsub.s16 \x2, \x6, \x2
+ vsub.s16 \t14, \x0, \x2
+ vadd.s16 \x2, \x0, \x2
+ vadd.s16 \x0, \x4, \x6
+ vsub.s16 \x4, \x4, \x6
+ vadd.s16 \x1, \x1, \t13
+ vadd.s16 \t13, \x7, \x5
+ vsub.s16 \t12, \t13, \t12
+ vsub.s16 \t12, \t12, \t10
+ vadd.s16 \t11, \t12, \t11
+ vsub.s16 \t10, \x1, \t10
+ vadd.s16 \t10, \t10, \t11
+
+ vsub.s16 \x7, \x0, \t13
+ vadd.s16 \x0, \x0, \t13
+ vadd.s16 \x6, \t14, \t12
+ vsub.s16 \x1, \t14, \t12
+ vsub.s16 \x5, \x2, \t11
+ vadd.s16 \x2, \x2, \t11
+ vsub.s16 \x3, \x4, \t10
+ vadd.s16 \x4, \x4, \t10
+.endm
+
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
asm_function jsimd_idct_ifast_neon
DCT_TABLE .req r0
COEF_BLOCK .req r1
OUTPUT_BUF .req r2
OUTPUT_COL .req r3
+<<<<<<< HEAD
TMP1 .req r0
TMP2 .req r1
TMP3 .req r2
@@ -756,18 +824,117 @@ asm_function jsimd_idct_ifast_neon
vst1.8 {d21}, [TMP2]
vst1.8 {d22}, [TMP3]
vst1.8 {d23}, [TMP4]
+=======
+ TMP .req ip
+
+ vpush {d8-d15}
+
+ /* Load constants */
+ adr TMP, jsimd_idct_ifast_neon_consts
+ vld1.16 {d0}, [TMP, :64]
+
+ /* Load all COEF_BLOCK into NEON registers with the following allocation:
+ * 0 1 2 3 | 4 5 6 7
+ * ---------+--------
+ * 0 | d4 | d5
+ * 1 | d6 | d7
+ * 2 | d8 | d9
+ * 3 | d10 | d11
+ * 4 | d12 | d13
+ * 5 | d14 | d15
+ * 6 | d16 | d17
+ * 7 | d18 | d19
+ */
+ vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]!
+ vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK]!
+ vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK]!
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK]!
+ /* Dequantize */
+ vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]!
+ vmul.s16 q2, q2, q10
+ vld1.16 {d24, d25, d26, d27}, [DCT_TABLE]!
+ vmul.s16 q3, q3, q11
+ vmul.s16 q4, q4, q12
+ vld1.16 {d28, d29, d30, d31}, [DCT_TABLE]!
+ vmul.s16 q5, q5, q13
+ vmul.s16 q6, q6, q14
+ vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]!
+ vmul.s16 q7, q7, q15
+ vmul.s16 q8, q8, q10
+ vmul.s16 q9, q9, q11
+
+ /* Pass 1 */
+ idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
+ /* Transpose */
+ transpose_4x4 d4, d6, d8, d10
+ transpose_4x4 d5, d7, d9, d11
+ transpose_4x4 d12, d14, d16, d18
+ transpose_4x4 d13, d15, d17, d19
+ vswp d12, d5
+ vswp d14, d7
+ vswp d16, d9
+ vswp d18, d11
+
+ /* Pass 2 */
+ idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
+ /* Transpose */
+ transpose_4x4 d4, d6, d8, d10
+ transpose_4x4 d5, d7, d9, d11
+ transpose_4x4 d12, d14, d16, d18
+ transpose_4x4 d13, d15, d17, d19
+ vswp d12, d5
+ vswp d14, d7
+ vswp d16, d9
+ vswp d18, d11
+
+ /* Descale and range limit */
+ vmov.s16 q15, #(0x80 << 5)
+ vqadd.s16 q2, q2, q15
+ vqadd.s16 q3, q3, q15
+ vqadd.s16 q4, q4, q15
+ vqadd.s16 q5, q5, q15
+ vqadd.s16 q6, q6, q15
+ vqadd.s16 q7, q7, q15
+ vqadd.s16 q8, q8, q15
+ vqadd.s16 q9, q9, q15
+ vqshrun.s16 d4, q2, #5
+ vqshrun.s16 d6, q3, #5
+ vqshrun.s16 d8, q4, #5
+ vqshrun.s16 d10, q5, #5
+ vqshrun.s16 d12, q6, #5
+ vqshrun.s16 d14, q7, #5
+ vqshrun.s16 d16, q8, #5
+ vqshrun.s16 d18, q9, #5
+
+ /* Store results to the output buffer */
+ .irp x, d4, d6, d8, d10, d12, d14, d16, d18
+ ldr TMP, [OUTPUT_BUF], #4
+ add TMP, TMP, OUTPUT_COL
+ vst1.8 {\x}, [TMP]!
+ .endr
+
+ vpop {d8-d15}
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
bx lr
.unreq DCT_TABLE
.unreq COEF_BLOCK
.unreq OUTPUT_BUF
.unreq OUTPUT_COL
+<<<<<<< HEAD
.unreq TMP1
.unreq TMP2
.unreq TMP3
.unreq TMP4
.endfunc
+=======
+ .unreq TMP
+.endfunc
+
+.purgem idct_helper
+
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
/*****************************************************************************/
/*
@@ -1152,12 +1319,21 @@ asm_function jsimd_idct_2x2_neon
.macro do_load size
.if \size == 8
+<<<<<<< HEAD
vld1.8 {d4}, [U, :64]!
vld1.8 {d5}, [V, :64]!
vld1.8 {d0}, [Y, :64]!
pld [U, #64]
pld [V, #64]
pld [Y, #64]
+=======
+ vld1.8 {d4}, [U]!
+ vld1.8 {d5}, [V]!
+ vld1.8 {d0}, [Y]!
+ pld [Y, #64]
+ pld [U, #64]
+ pld [V, #64]
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
.elseif \size == 4
vld1.8 {d4[0]}, [U]!
vld1.8 {d4[1]}, [U]!
@@ -1227,11 +1403,15 @@ asm_function jsimd_idct_2x2_neon
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+<<<<<<< HEAD
/*
* 2 stage pipelined YCbCr->RGB conversion
*/
.macro do_yuv_to_rgb_stage1
+=======
+.macro do_yuv_to_rgb
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
vaddw.u8 q4, q1, d5 /* q2 = v - 128 */
vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
@@ -1242,9 +1422,12 @@ asm_function jsimd_idct_2x2_neon
vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
+<<<<<<< HEAD
.endm
.macro do_yuv_to_rgb_stage2
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
vrshrn.s32 d20, q10, #15
vrshrn.s32 d21, q11, #15
vrshrn.s32 d24, q12, #14
@@ -1259,6 +1442,7 @@ asm_function jsimd_idct_2x2_neon
vqmovun.s16 d1\b_offs, q14
.endm
+<<<<<<< HEAD
.macro do_yuv_to_rgb_stage2_store_load_stage1
vld1.8 {d4}, [U, :64]!
vrshrn.s32 d20, q10, #15
@@ -1296,6 +1480,8 @@ asm_function jsimd_idct_2x2_neon
do_yuv_to_rgb_stage2
.endm
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
/* Apple gas crashes on adrl, work around that by using adr.
* But this requires a copy of these constants for each function.
*/
@@ -1356,6 +1542,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
/* Inner loop over pixels */
subs N, N, #8
+<<<<<<< HEAD
blt 3f
do_load 8
do_yuv_to_rgb_stage1
@@ -1371,6 +1558,18 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
tst N, #7
beq 8f
3:
+=======
+ blt 2f
+1:
+ do_load 8
+ do_yuv_to_rgb
+ do_store \bpp, 8
+ subs N, N, #8
+ bge 1b
+ tst N, #7
+ beq 8f
+2:
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
tst N, #4
beq 3f
do_load 4
@@ -1418,9 +1617,12 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
.endfunc
.purgem do_yuv_to_rgb
+<<<<<<< HEAD
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
.endm
@@ -1436,6 +1638,7 @@ generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
.purgem do_store
/*****************************************************************************/
+<<<<<<< HEAD
/*
* jsimd_extrgb_ycc_convert_neon
@@ -2031,3 +2234,5 @@ asm_function jsimd_quantize_neon
.unreq SHIFT
.unreq LOOP_COUNT
.endfunc
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c
index d9bb774..f77c5ef 100644
--- a/simd/jsimd_i386.c
+++ b/simd/jsimd_i386.c
@@ -2,7 +2,7 @@
* jsimd_i386.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009 D. R. Commander
+ * Copyright 2009-2011 D. R. Commander
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -84,6 +84,28 @@ jsimd_can_rgb_ycc (void)
}
GLOBAL(int)
+jsimd_can_rgb_gray (void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
jsimd_can_ycc_rgb (void)
{
init_simd();
@@ -155,6 +177,55 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
}
GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
+{ /* Picks the rgb_gray routine pair for cinfo->in_color_space, then calls the SSE2 or MMX one. */
+ void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); /* assigned in the switch below */
+ void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); /* assigned in the switch below */
+
+ switch(cinfo->in_color_space) /* every case sets both pointers */
+ {
+ case JCS_EXT_RGB:
+ sse2fct=jsimd_extrgb_gray_convert_sse2;
+ mmxfct=jsimd_extrgb_gray_convert_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ sse2fct=jsimd_extrgbx_gray_convert_sse2;
+ mmxfct=jsimd_extrgbx_gray_convert_mmx;
+ break;
+ case JCS_EXT_BGR:
+ sse2fct=jsimd_extbgr_gray_convert_sse2;
+ mmxfct=jsimd_extbgr_gray_convert_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ sse2fct=jsimd_extbgrx_gray_convert_sse2;
+ mmxfct=jsimd_extbgrx_gray_convert_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ sse2fct=jsimd_extxbgr_gray_convert_sse2;
+ mmxfct=jsimd_extxbgr_gray_convert_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ sse2fct=jsimd_extxrgb_gray_convert_sse2;
+ mmxfct=jsimd_extxrgb_gray_convert_mmx;
+ break;
+ default: /* any colour space not listed above uses the non-"ext" routines */
+ sse2fct=jsimd_rgb_gray_convert_sse2;
+ mmxfct=jsimd_rgb_gray_convert_mmx;
+ break;
+ }
+
+ if ((simd_support & JSIMD_SSE2) && /* SSE2 is tried first and also requires the alignment check */
+ IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+ sse2fct(cinfo->image_width, input_buf,
+ output_buf, output_row, num_rows);
+ else if (simd_support & JSIMD_MMX) /* otherwise use MMX when its flag is set */
+ mmxfct(cinfo->image_width, input_buf,
+ output_buf, output_row, num_rows);
+} /* if neither condition holds, no routine is called */
+
+GLOBAL(void)
jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows)
diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c
index 7659249..2951268 100644
--- a/simd/jsimd_x86_64.c
+++ b/simd/jsimd_x86_64.c
@@ -2,7 +2,7 @@
* jsimd_x86_64.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009 D. R. Commander
+ * Copyright 2009-2011 D. R. Commander
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -47,6 +47,23 @@ jsimd_can_rgb_ycc (void)
}
GLOBAL(int)
+jsimd_can_rgb_gray (void)
+{ /* Returns 1 if the SSE2 rgb_gray routine may be used, otherwise 0. */
+ /* The code is optimised for these values only; any other value makes this return 0 */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (!IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) /* the alignment check is the only remaining gate */
+ return 0;
+
+ return 1; /* no simd_support flag is consulted in this function */
+}
+
+GLOBAL(int)
jsimd_can_ycc_rgb (void)
{
/* The code is optimised for these values only */
@@ -99,6 +116,41 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
}
GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
+{ /* Picks the SSE2 rgb_gray routine for cinfo->in_color_space and calls it. */
+ void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); /* assigned in the switch below */
+
+ switch(cinfo->in_color_space) /* every case sets sse2fct */
+ {
+ case JCS_EXT_RGB:
+ sse2fct=jsimd_extrgb_gray_convert_sse2;
+ break;
+ case JCS_EXT_RGBX:
+ sse2fct=jsimd_extrgbx_gray_convert_sse2;
+ break;
+ case JCS_EXT_BGR:
+ sse2fct=jsimd_extbgr_gray_convert_sse2;
+ break;
+ case JCS_EXT_BGRX:
+ sse2fct=jsimd_extbgrx_gray_convert_sse2;
+ break;
+ case JCS_EXT_XBGR:
+ sse2fct=jsimd_extxbgr_gray_convert_sse2;
+ break;
+ case JCS_EXT_XRGB:
+ sse2fct=jsimd_extxrgb_gray_convert_sse2;
+ break;
+ default: /* any colour space not listed above uses the non-"ext" routine */
+ sse2fct=jsimd_rgb_gray_convert_sse2;
+ break;
+ }
+
+ sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); /* called unconditionally; no support or alignment test is made here */
+}
+
+GLOBAL(void)
jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows)
diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc
index c4297f9..635a931 100644
--- a/simd/jsimdext.inc
+++ b/simd/jsimdext.inc
@@ -38,16 +38,26 @@
; -- segment definition --
;
+%ifdef __YASM_VER__
+%define SEG_TEXT .text align=16
+%define SEG_CONST .rdata align=16
+%else
%define SEG_TEXT .text align=16 public use32 class=CODE
%define SEG_CONST .rdata align=16 public use32 class=CONST
+%endif
%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)--------
; * Microsoft Visual C++
; -- segment definition --
;
+%ifdef __YASM_VER__
+%define SEG_TEXT .text align=16
+%define SEG_CONST .rdata align=16
+%else
%define SEG_TEXT .text align=16 public use64 class=CODE
%define SEG_CONST .rdata align=16 public use64 class=CONST
+%endif
%define EXTN(name) name ; foo() -> foo
%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)----------
@@ -297,8 +307,6 @@ const_base:
%ifdef WIN64
%imacro collect_args 0
- push r10
- push r11
push r12
push r13
push r14
@@ -328,8 +336,6 @@ const_base:
pop r14
pop r13
pop r12
- pop r11
- pop r10
%endmacro
%else