diff options
author | Tom Gall <tom.gall@linaro.org> | 2011-08-29 09:53:17 -0500 |
---|---|---|
committer | Tom Gall <tom.gall@linaro.org> | 2011-08-29 09:53:17 -0500 |
commit | e15681f491e8dc2762ad449980b0a3eafbddaca8 (patch) | |
tree | 9517c975d22fc1618c102743ba429cec1d64ef9f /simd | |
parent | c14f6455230cbff4948fde981eed116298f5abb6 (diff) | |
parent | 4a72099711359606b1fc10c1744057a6c568d5d8 (diff) |
clear out gorp
Diffstat (limited to 'simd')
-rwxr-xr-x | simd/CMakeLists.txt | 20 | ||||
-rw-r--r-- | simd/Makefile.am | 17 | ||||
-rw-r--r-- | simd/jcclrmmx.asm | 2 | ||||
-rw-r--r-- | simd/jcclrss2-64.asm | 2 | ||||
-rw-r--r-- | simd/jcclrss2.asm | 2 | ||||
-rw-r--r-- | simd/jccolmmx.asm | 3 | ||||
-rw-r--r-- | simd/jccolss2-64.asm | 3 | ||||
-rw-r--r-- | simd/jccolss2.asm | 3 | ||||
-rw-r--r-- | simd/jdclrmmx.asm | 2 | ||||
-rw-r--r-- | simd/jdclrss2-64.asm | 2 | ||||
-rw-r--r-- | simd/jdclrss2.asm | 2 | ||||
-rw-r--r-- | simd/jdcolmmx.asm | 3 | ||||
-rw-r--r-- | simd/jdcolss2-64.asm | 3 | ||||
-rw-r--r-- | simd/jdcolss2.asm | 3 | ||||
-rw-r--r-- | simd/jdmermmx.asm | 3 | ||||
-rw-r--r-- | simd/jdmerss2-64.asm | 3 | ||||
-rw-r--r-- | simd/jdmerss2.asm | 3 | ||||
-rw-r--r-- | simd/jdmrgmmx.asm | 2 | ||||
-rw-r--r-- | simd/jdmrgss2-64.asm | 2 | ||||
-rw-r--r-- | simd/jdmrgss2.asm | 2 | ||||
-rw-r--r-- | simd/jsimd.h | 129 | ||||
-rw-r--r-- | simd/jsimd_arm.c | 30 | ||||
-rw-r--r-- | simd/jsimd_arm_neon.S | 205 | ||||
-rw-r--r-- | simd/jsimd_i386.c | 73 | ||||
-rw-r--r-- | simd/jsimd_x86_64.c | 54 | ||||
-rw-r--r-- | simd/jsimdext.inc | 14 |
26 files changed, 546 insertions, 41 deletions
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt index 263579d..397a9f3 100755 --- a/simd/CMakeLists.txt +++ b/simd/CMakeLists.txt @@ -18,16 +18,16 @@ if(CMAKE_BUILD_TYPE STREQUAL "Debug" endif() if(SIMD_X86_64) - set(SIMD_BASENAMES jfsseflt-64 jccolss2-64 jdcolss2-64 jcsamss2-64 - jdsamss2-64 jdmerss2-64 jcqnts2i-64 jfss2fst-64 jfss2int-64 jiss2red-64 - jiss2int-64 jiss2fst-64 jcqnts2f-64 jiss2flt-64) + set(SIMD_BASENAMES jfsseflt-64 jccolss2-64 jdcolss2-64 jcgrass2-64 + jcsamss2-64 jdsamss2-64 jdmerss2-64 jcqnts2i-64 jfss2fst-64 jfss2int-64 + jiss2red-64 jiss2int-64 jiss2fst-64 jcqnts2f-64 jiss2flt-64) message(STATUS "Building x86_64 SIMD extensions") else() - set(SIMD_BASENAMES jsimdcpu jccolmmx jdcolmmx jcsammmx jdsammmx jdmermmx - jcqntmmx jfmmxfst jfmmxint jimmxred jimmxint jimmxfst jcqnt3dn jf3dnflt - ji3dnflt jcqntsse jfsseflt jisseflt jccolss2 jdcolss2 jcsamss2 jdsamss2 - jdmerss2 jcqnts2i jfss2fst jfss2int jiss2red jiss2int jiss2fst jcqnts2f - jiss2flt) + set(SIMD_BASENAMES jsimdcpu jccolmmx jcgrammx jdcolmmx jcsammmx jdsammmx + jdmermmx jcqntmmx jfmmxfst jfmmxint jimmxred jimmxint jimmxfst jcqnt3dn + jf3dnflt ji3dnflt jcqntsse jfsseflt jisseflt jccolss2 jcgrass2 jdcolss2 + jcsamss2 jdsamss2 jdmerss2 jcqnts2i jfss2fst jfss2int jiss2red jiss2int + jiss2fst jcqnts2f jiss2flt) message(STATUS "Building i386 SIMD extensions") endif() @@ -48,6 +48,10 @@ foreach(file ${SIMD_BASENAMES}) set(DEPFILE ${SIMD_SRC}) string(REGEX REPLACE "mer" "mrg" DEPFILE ${DEPFILE}) endif() + if(${file} MATCHES gra) + set(DEPFILE ${SIMD_SRC}) + string(REGEX REPLACE "gra" "gry" DEPFILE ${DEPFILE}) + endif() set(SIMD_OBJ ${OBJDIR}/${file}.obj) add_custom_command(OUTPUT ${SIMD_OBJ} DEPENDS ${SIMD_SRC} ${DEPFILE} *.inc COMMAND ${NASM} ${NAFLAGS} ${SIMD_SRC} -o${SIMD_OBJ}) diff --git a/simd/Makefile.am b/simd/Makefile.am index 81c23af..fbba807 100644 --- a/simd/Makefile.am +++ b/simd/Makefile.am @@ -4,14 +4,14 @@ BUILT_SOURCES = jsimdcfg.inc EXTRA_DIST = nasm_lt.sh jcclrmmx.asm jcclrss2.asm jdclrmmx.asm jdclrss2.asm \ jdmrgmmx.asm jdmrgss2.asm jcclrss2-64.asm jdclrss2-64.asm \ - jdmrgss2-64.asm CMakeLists.txt + jdmrgss2-64.asm jcgryss2-64.asm jcgrymmx.asm jcgryss2.asm CMakeLists.txt if SIMD_X86_64 libsimd_la_SOURCES = jsimd_x86_64.c \ jsimd.h jsimdcfg.inc.h \ jsimdext.inc jcolsamp.inc jdct.inc \ - jfsseflt-64.asm \ + jfsseflt-64.asm jcgrass2-64.asm \ jccolss2-64.asm jdcolss2-64.asm \ jcsamss2-64.asm jdsamss2-64.asm jdmerss2-64.asm \ jcqnts2i-64.asm jfss2fst-64.asm jfss2int-64.asm \ @@ -20,6 +20,7 @@ libsimd_la_SOURCES = jsimd_x86_64.c \ jccolss2-64.lo: jcclrss2-64.asm jdcolss2-64.lo: jdclrss2-64.asm +jcgrass2-64.lo: jcgryss2-64.asm jdmerss2-64.lo: jdmrgss2-64.asm endif @@ -29,26 +30,34 @@ libsimd_la_SOURCES = jsimd_i386.c \ jsimd.h jsimdcfg.inc.h \ jsimdext.inc jcolsamp.inc jdct.inc \ jsimdcpu.asm \ - jccolmmx.asm jdcolmmx.asm \ + jccolmmx.asm jdcolmmx.asm jcgrammx.asm \ jcsammmx.asm jdsammmx.asm jdmermmx.asm \ jcqntmmx.asm jfmmxfst.asm jfmmxint.asm \ jimmxred.asm jimmxint.asm jimmxfst.asm \ jcqnt3dn.asm jf3dnflt.asm ji3dnflt.asm \ jcqntsse.asm jfsseflt.asm jisseflt.asm \ - jccolss2.asm jdcolss2.asm \ + jccolss2.asm jdcolss2.asm jcgrass2.asm \ jcsamss2.asm jdsamss2.asm jdmerss2.asm \ jcqnts2i.asm jfss2fst.asm jfss2int.asm \ jiss2red.asm jiss2int.asm jiss2fst.asm \ jcqnts2f.asm jiss2flt.asm jccolmmx.lo: jcclrmmx.asm +jcgrammx.lo: jcgrymmx.asm jccolss2.lo: jcclrss2.asm +jcgrass2.lo: jcgryss2.asm jdcolmmx.lo: jdclrmmx.asm jdcolss2.lo: jdclrss2.asm jdmermmx.lo: jdmrgmmx.asm jdmerss2.lo: jdmrgss2.asm endif +if SIMD_ARM + +libsimd_la_SOURCES = jsimd_arm.c jsimd_arm_neon.S + +endif + AM_CPPFLAGS = -I$(top_srcdir) .asm.lo: diff --git a/simd/jcclrmmx.asm b/simd/jcclrmmx.asm index b6b8912..e095253 100644 --- a/simd/jcclrmmx.asm +++ b/simd/jcclrmmx.asm @@ -19,8 +19,6 @@ %include "jcolsamp.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 ; ; Convert some rows of samples to the output colorspace. ; diff --git a/simd/jcclrss2-64.asm b/simd/jcclrss2-64.asm index 8ca47aa..f5d6bed 100644 --- a/simd/jcclrss2-64.asm +++ b/simd/jcclrss2-64.asm @@ -17,8 +17,6 @@ %include "jcolsamp.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 ; ; Convert some rows of samples to the output colorspace. ; diff --git a/simd/jcclrss2.asm b/simd/jcclrss2.asm index 8def718..517b705 100644 --- a/simd/jcclrss2.asm +++ b/simd/jcclrss2.asm @@ -16,8 +16,6 @@ %include "jcolsamp.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 ; ; Convert some rows of samples to the output colorspace. ; diff --git a/simd/jccolmmx.asm b/simd/jccolmmx.asm index 5e7f3be..d0f0d79 100644 --- a/simd/jccolmmx.asm +++ b/simd/jccolmmx.asm @@ -51,6 +51,9 @@ PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) alignz 16 ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + %include "jcclrmmx.asm" %undef RGB_RED diff --git a/simd/jccolss2-64.asm b/simd/jccolss2-64.asm index 64ee0ba..18de456 100644 --- a/simd/jccolss2-64.asm +++ b/simd/jccolss2-64.asm @@ -48,6 +48,9 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) alignz 16 ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + %include "jcclrss2-64.asm" %undef RGB_RED diff --git a/simd/jccolss2.asm b/simd/jccolss2.asm index 8d1f734..7acb59c 100644 --- a/simd/jccolss2.asm +++ b/simd/jccolss2.asm @@ -48,6 +48,9 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) alignz 16 ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + %include "jcclrss2.asm" %undef RGB_RED diff --git a/simd/jdclrmmx.asm b/simd/jdclrmmx.asm index 79772e0..1c255e8 100644 --- a/simd/jdclrmmx.asm +++ b/simd/jdclrmmx.asm @@ -19,8 +19,6 @@ %include "jcolsamp.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 ; ; Convert some rows of samples to the output colorspace. ; diff --git a/simd/jdclrss2-64.asm b/simd/jdclrss2-64.asm index 4282bd2..fdb33a3 100644 --- a/simd/jdclrss2-64.asm +++ b/simd/jdclrss2-64.asm @@ -20,8 +20,6 @@ %include "jcolsamp.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 ; ; Convert some rows of samples to the output colorspace. ; diff --git a/simd/jdclrss2.asm b/simd/jdclrss2.asm index 865fa82..3059d7d 100644 --- a/simd/jdclrss2.asm +++ b/simd/jdclrss2.asm @@ -19,8 +19,6 @@ %include "jcolsamp.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 ; ; Convert some rows of samples to the output colorspace. ; diff --git a/simd/jdcolmmx.asm b/simd/jdcolmmx.asm index 58775e8..0834bab 100644 --- a/simd/jdcolmmx.asm +++ b/simd/jdcolmmx.asm @@ -48,6 +48,9 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) alignz 16 ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + %include "jdclrmmx.asm" %undef RGB_RED diff --git a/simd/jdcolss2-64.asm b/simd/jdcolss2-64.asm index 2e97d59..d14a28a 100644 --- a/simd/jdcolss2-64.asm +++ b/simd/jdcolss2-64.asm @@ -48,6 +48,9 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) alignz 16 ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + %include "jdclrss2-64.asm" %undef RGB_RED diff --git a/simd/jdcolss2.asm b/simd/jdcolss2.asm index 7ae985d..cab4dd0 100644 --- a/simd/jdcolss2.asm +++ b/simd/jdcolss2.asm @@ -48,6 +48,9 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) alignz 16 ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + %include "jdclrss2.asm" %undef RGB_RED diff --git a/simd/jdmermmx.asm b/simd/jdmermmx.asm index fd587fb..75baaa5 100644 --- a/simd/jdmermmx.asm +++ b/simd/jdmermmx.asm @@ -48,6 +48,9 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) alignz 16 ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + %include "jdmrgmmx.asm" %undef RGB_RED diff --git a/simd/jdmerss2-64.asm b/simd/jdmerss2-64.asm index 1f0b10f..a1fe963 100644 --- a/simd/jdmerss2-64.asm +++ b/simd/jdmerss2-64.asm @@ -48,6 +48,9 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) alignz 16 ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + %include "jdmrgss2-64.asm" %undef RGB_RED diff --git a/simd/jdmerss2.asm b/simd/jdmerss2.asm index 2294e0d..21881b4 100644 --- a/simd/jdmerss2.asm +++ b/simd/jdmerss2.asm @@ -48,6 +48,9 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) alignz 16 ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + %include "jdmrgss2.asm" %undef RGB_RED diff --git a/simd/jdmrgmmx.asm b/simd/jdmrgmmx.asm index b5777a3..d0800a7 100644 --- a/simd/jdmrgmmx.asm +++ b/simd/jdmrgmmx.asm @@ -19,8 +19,6 @@ %include "jcolsamp.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 ; ; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. ; diff --git a/simd/jdmrgss2-64.asm b/simd/jdmrgss2-64.asm index 121bb82..0c2503f 100644 --- a/simd/jdmrgss2-64.asm +++ b/simd/jdmrgss2-64.asm @@ -20,8 +20,6 @@ %include "jcolsamp.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 ; ; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. ; diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm index 99b7eb9..368ac3c 100644 --- a/simd/jdmrgss2.asm +++ b/simd/jdmrgss2.asm @@ -19,8 +19,6 @@ %include "jcolsamp.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 ; ; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. ; diff --git a/simd/jsimd.h b/simd/jsimd.h index 89ac1b7..39a0867 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -2,6 +2,7 @@ * simd/jsimd.h * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright 2011 D. R. Commander * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -11,11 +12,12 @@ /* Bitmask for supported acceleration methods */ -#define JSIMD_NONE 0x00 -#define JSIMD_MMX 0x01 -#define JSIMD_3DNOW 0x02 -#define JSIMD_SSE 0x04 -#define JSIMD_SSE2 0x08 +#define JSIMD_NONE 0x00 +#define JSIMD_MMX 0x01 +#define JSIMD_3DNOW 0x02 +#define JSIMD_SSE 0x04 +#define JSIMD_SSE2 0x08 +#define JSIMD_ARM_NEON 0x10 /* Short forms of external names for systems with brain-damaged linkers. */ @@ -28,6 +30,13 @@ #define jsimd_extbgrx_ycc_convert_mmx jSEXTBGRXYCCM #define jsimd_extxbgr_ycc_convert_mmx jSEXTXBGRYCCM #define jsimd_extxrgb_ycc_convert_mmx jSEXTXRGBYCCM +#define jsimd_rgb_gray_convert_mmx jSRGBGRYM +#define jsimd_extrgb_gray_convert_mmx jSEXTRGBGRYM +#define jsimd_extrgbx_gray_convert_mmx jSEXTRGBXGRYM +#define jsimd_extbgr_gray_convert_mmx jSEXTBGRGRYM +#define jsimd_extbgrx_gray_convert_mmx jSEXTBGRXGRYM +#define jsimd_extxbgr_gray_convert_mmx jSEXTXBGRGRYM +#define jsimd_extxrgb_gray_convert_mmx jSEXTXRGBGRYM #define jsimd_ycc_rgb_convert_mmx jSYCCRGBM #define jsimd_ycc_extrgb_convert_mmx jSYCCEXTRGBM #define jsimd_ycc_extrgbx_convert_mmx jSYCCEXTRGBXM @@ -43,6 +52,14 @@ #define jsimd_extbgrx_ycc_convert_sse2 jSEXTBGRXYCCS2 #define jsimd_extxbgr_ycc_convert_sse2 jSEXTXBGRYCCS2 #define jsimd_extxrgb_ycc_convert_sse2 jSEXTXRGBYCCS2 +#define jconst_rgb_gray_convert_sse2 jSCRGBGRYS2 +#define jsimd_rgb_gray_convert_sse2 jSRGBGRYS2 +#define jsimd_extrgb_gray_convert_sse2 jSEXTRGBGRYS2 +#define jsimd_extrgbx_gray_convert_sse2 jSEXTRGBXGRYS2 +#define jsimd_extbgr_gray_convert_sse2 jSEXTBGRGRYS2 +#define jsimd_extbgrx_gray_convert_sse2 jSEXTBGRXGRYS2 +#define jsimd_extxbgr_gray_convert_sse2 jSEXTXBGRGRYS2 +#define jsimd_extxrgb_gray_convert_sse2 jSEXTXRGBGRYS2 #define jconst_ycc_rgb_convert_sse2 jSCYCCRGBS2 #define jsimd_ycc_rgb_convert_sse2 jSYCCRGBS2 #define jsimd_ycc_extrgb_convert_sse2 jSYCCEXTRGBS2 @@ -163,6 +180,35 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_mmx JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_rgb_gray_convert_mmx + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extrgb_gray_convert_mmx + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extrgbx_gray_convert_mmx + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extbgr_gray_convert_mmx + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extbgrx_gray_convert_mmx + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extxbgr_gray_convert_mmx + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extxrgb_gray_convert_mmx + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); + EXTERN(void) jsimd_ycc_rgb_convert_mmx JPP((JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, @@ -222,6 +268,36 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_sse2 JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)); +extern const int jconst_rgb_gray_convert_sse2[]; +EXTERN(void) jsimd_rgb_gray_convert_sse2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extrgb_gray_convert_sse2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extrgbx_gray_convert_sse2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extbgr_gray_convert_sse2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extbgrx_gray_convert_sse2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extxbgr_gray_convert_sse2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extxrgb_gray_convert_sse2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); + extern const int jconst_ycc_rgb_convert_sse2[]; EXTERN(void) jsimd_ycc_rgb_convert_sse2 JPP((JDIMENSION out_width, @@ -252,6 +328,35 @@ EXTERN(void) jsimd_ycc_extxrgb_convert_sse2 JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)); +EXTERN(void) jsimd_ycc_rgb_convert_neon + JPP((JDIMENSION out_width, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); +EXTERN(void) jsimd_ycc_extrgb_convert_neon + JPP((JDIMENSION out_width, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); +EXTERN(void) jsimd_ycc_extrgbx_convert_neon + JPP((JDIMENSION out_width, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); +EXTERN(void) jsimd_ycc_extbgr_convert_neon + JPP((JDIMENSION out_width, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); +EXTERN(void) jsimd_ycc_extbgrx_convert_neon + JPP((JDIMENSION out_width, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); +EXTERN(void) jsimd_ycc_extxbgr_convert_neon + JPP((JDIMENSION out_width, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); +EXTERN(void) jsimd_ycc_extxrgb_convert_neon + JPP((JDIMENSION out_width, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); + /* SIMD Downsample */ EXTERN(void) jsimd_h2v2_downsample_mmx JPP((JDIMENSION image_width, int max_v_samp_factor, @@ -464,6 +569,15 @@ EXTERN(void) jsimd_idct_4x4_sse2 JPP((void * dct_table, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jsimd_idct_2x2_neon JPP((void * dct_table, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, + JDIMENSION output_col)); +EXTERN(void) jsimd_idct_4x4_neon JPP((void * dct_table, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, + JDIMENSION output_col)); + /* SIMD Inverse DCT */ EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table, JCOEFPTR coef_block, @@ -485,6 +599,11 @@ EXTERN(void) jsimd_idct_ifast_sse2 JPP((void * dct_table, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jsimd_idct_ifast_neon JPP((void * dct_table, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, + JDIMENSION output_col)); + EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c index a9d920c..9ed49fd 100644 --- a/simd/jsimd_arm.c +++ b/simd/jsimd_arm.c @@ -136,6 +136,7 @@ jsimd_can_rgb_ycc (void) { init_simd(); +<<<<<<< HEAD /* The code is optimised for these values only */ if (BITS_IN_JSAMPLE != 8) return 0; @@ -147,6 +148,8 @@ jsimd_can_rgb_ycc (void) if (simd_support & JSIMD_ARM_NEON) return 1; +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 return 0; } @@ -181,6 +184,7 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows) { +<<<<<<< HEAD void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); switch(cinfo->in_color_space) @@ -211,6 +215,8 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo, if (simd_support & JSIMD_ARM_NEON) neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 } GLOBAL(void) @@ -386,6 +392,7 @@ jsimd_can_convsamp (void) { init_simd(); +<<<<<<< HEAD /* The code is optimised for these values only */ if (DCTSIZE != 8) return 0; @@ -399,6 +406,8 @@ jsimd_can_convsamp (void) if (simd_support & JSIMD_ARM_NEON) return 1; +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 return 0; } @@ -414,8 +423,11 @@ GLOBAL(void) jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace) { +<<<<<<< HEAD if (simd_support & JSIMD_ARM_NEON) jsimd_convsamp_neon(sample_data, start_col, workspace); +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 } GLOBAL(void) @@ -437,6 +449,7 @@ jsimd_can_fdct_ifast (void) { init_simd(); +<<<<<<< HEAD /* The code is optimised for these values only */ if (DCTSIZE != 8) return 0; @@ -446,6 +459,8 @@ jsimd_can_fdct_ifast (void) if (simd_support & JSIMD_ARM_NEON) return 1; +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 return 0; } @@ -465,8 +480,11 @@ jsimd_fdct_islow (DCTELEM * data) GLOBAL(void) jsimd_fdct_ifast (DCTELEM * data) { +<<<<<<< HEAD if (simd_support & JSIMD_ARM_NEON) jsimd_fdct_ifast_neon(data); +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 } GLOBAL(void) @@ -479,6 +497,7 @@ jsimd_can_quantize (void) { init_simd(); +<<<<<<< HEAD /* The code is optimised for these values only */ if (DCTSIZE != 8) return 0; @@ -490,6 +509,8 @@ jsimd_can_quantize (void) if (simd_support & JSIMD_ARM_NEON) return 1; +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 return 0; } @@ -505,8 +526,11 @@ GLOBAL(void) jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace) { +<<<<<<< HEAD if (simd_support & JSIMD_ARM_NEON) jsimd_quantize_neon(coef_block, divisors, workspace); +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 } GLOBAL(void) @@ -584,6 +608,7 @@ jsimd_can_idct_islow (void) { init_simd(); +<<<<<<< HEAD /* The code is optimised for these values only */ if (DCTSIZE != 8) return 0; @@ -599,6 +624,8 @@ jsimd_can_idct_islow (void) if (simd_support & JSIMD_ARM_NEON) return 1; +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 return 0; } @@ -640,8 +667,11 @@ jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { +<<<<<<< HEAD if ((simd_support & JSIMD_ARM_NEON)) jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, output_col); +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 } GLOBAL(void) diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S index 9ef6efc..3f1cf9e 100644 --- a/simd/jsimd_arm_neon.S +++ b/simd/jsimd_arm_neon.S @@ -62,6 +62,7 @@ _\fname: vtrn.32 \x1, \x3 .endm +<<<<<<< HEAD #define CENTERJSAMPLE 128 /*****************************************************************************/ @@ -536,6 +537,8 @@ asm_function jsimd_idct_islow_neon .unreq ROW7R .endfunc +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 /*****************************************************************************/ /* @@ -543,6 +546,7 @@ asm_function jsimd_idct_islow_neon * * This function contains a fast, not so accurate integer implementation of * the inverse DCT (Discrete Cosine Transform). It uses the same calculations +<<<<<<< HEAD * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' * function from jidctfst.c * @@ -552,6 +556,12 @@ asm_function jsimd_idct_islow_neon * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", * which introduces an extra addition. Overall, there are 6 extra additions * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. +======= + * and produces exactly the same output as IJG's original 'jpeg_idct_fast' + * function from jidctfst.c + * + * TODO: a bit better instructions scheduling is needed. +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 */ #define XFIX_1_082392200 d0[0] @@ -566,12 +576,70 @@ jsimd_idct_ifast_neon_consts: .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ +<<<<<<< HEAD +======= +/* 1-D IDCT helper macro */ + +.macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \ + t10, t11, t12, t13, t14 + + vsub.s16 \t10, \x0, \x4 + vadd.s16 \x4, \x0, \x4 + vswp.s16 \t10, \x0 + vsub.s16 \t11, \x2, \x6 + vadd.s16 \x6, \x2, \x6 + vswp.s16 \t11, \x2 + vsub.s16 \t10, \x3, \x5 + vadd.s16 \x5, \x3, \x5 + vswp.s16 \t10, \x3 + vsub.s16 \t11, \x1, \x7 + vadd.s16 \x7, \x1, \x7 + vswp.s16 \t11, \x1 + + vqdmulh.s16 \t13, \x2, d0[1] + vadd.s16 \t12, \x3, \x3 + vadd.s16 \x2, \x2, \t13 + vqdmulh.s16 \t13, \x3, d0[3] + vsub.s16 \t10, \x1, \x3 + vadd.s16 \t12, \t12, \t13 + vqdmulh.s16 \t13, \t10, d0[2] + vsub.s16 \t11, \x7, \x5 + vadd.s16 \t10, \t10, \t13 + vqdmulh.s16 \t13, \t11, d0[1] + vadd.s16 \t11, \t11, \t13 + + vqdmulh.s16 \t13, \x1, d0[0] + vsub.s16 \x2, \x6, \x2 + vsub.s16 \t14, \x0, \x2 + vadd.s16 \x2, \x0, \x2 + vadd.s16 \x0, \x4, \x6 + vsub.s16 \x4, \x4, \x6 + vadd.s16 \x1, \x1, \t13 + vadd.s16 \t13, \x7, \x5 + vsub.s16 \t12, \t13, \t12 + vsub.s16 \t12, \t12, \t10 + vadd.s16 \t11, \t12, \t11 + vsub.s16 \t10, \x1, \t10 + vadd.s16 \t10, \t10, \t11 + + vsub.s16 \x7, \x0, \t13 + vadd.s16 \x0, \x0, \t13 + vadd.s16 \x6, \t14, \t12 + vsub.s16 \x1, \t14, \t12 + vsub.s16 \x5, \x2, \t11 + vadd.s16 \x2, \x2, \t11 + vsub.s16 \x3, \x4, \t10 + vadd.s16 \x4, \x4, \t10 +.endm + +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 asm_function jsimd_idct_ifast_neon DCT_TABLE .req r0 COEF_BLOCK .req r1 OUTPUT_BUF .req r2 OUTPUT_COL .req r3 +<<<<<<< HEAD TMP1 .req r0 TMP2 .req r1 TMP3 .req r2 @@ -756,18 +824,117 @@ asm_function jsimd_idct_ifast_neon vst1.8 {d21}, [TMP2] vst1.8 {d22}, [TMP3] vst1.8 {d23}, [TMP4] +======= + TMP .req ip + + vpush {d8-d15} + + /* Load constants */ + adr TMP, jsimd_idct_ifast_neon_consts + vld1.16 {d0}, [TMP, :64] + + /* Load all COEF_BLOCK into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d4 | d5 + * 1 | d6 | d7 + * 2 | d8 | d9 + * 3 | d10 | d11 + * 4 | d12 | d13 + * 5 | d14 | d15 + * 6 | d16 | d17 + * 7 | d18 | d19 + */ + vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]! + vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK]! + vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK]! + vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK]! + /* Dequantize */ + vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]! + vmul.s16 q2, q2, q10 + vld1.16 {d24, d25, d26, d27}, [DCT_TABLE]! + vmul.s16 q3, q3, q11 + vmul.s16 q4, q4, q12 + vld1.16 {d28, d29, d30, d31}, [DCT_TABLE]! + vmul.s16 q5, q5, q13 + vmul.s16 q6, q6, q14 + vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]! + vmul.s16 q7, q7, q15 + vmul.s16 q8, q8, q10 + vmul.s16 q9, q9, q11 + + /* Pass 1 */ + idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14 + /* Transpose */ + transpose_4x4 d4, d6, d8, d10 + transpose_4x4 d5, d7, d9, d11 + transpose_4x4 d12, d14, d16, d18 + transpose_4x4 d13, d15, d17, d19 + vswp d12, d5 + vswp d14, d7 + vswp d16, d9 + vswp d18, d11 + + /* Pass 2 */ + idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14 + /* Transpose */ + transpose_4x4 d4, d6, d8, d10 + transpose_4x4 d5, d7, d9, d11 + transpose_4x4 d12, d14, d16, d18 + transpose_4x4 d13, d15, d17, d19 + vswp d12, d5 + vswp d14, d7 + vswp d16, d9 + vswp d18, d11 + + /* Descale and range limit */ + vmov.s16 q15, #(0x80 << 5) + vqadd.s16 q2, q2, q15 + vqadd.s16 q3, q3, q15 + vqadd.s16 q4, q4, q15 + vqadd.s16 q5, q5, q15 + vqadd.s16 q6, q6, q15 + vqadd.s16 q7, q7, q15 + vqadd.s16 q8, q8, q15 + vqadd.s16 q9, q9, q15 + vqshrun.s16 d4, q2, #5 + vqshrun.s16 d6, q3, #5 + vqshrun.s16 d8, q4, #5 + vqshrun.s16 d10, q5, #5 + vqshrun.s16 d12, q6, #5 + vqshrun.s16 d14, q7, #5 + vqshrun.s16 d16, q8, #5 + vqshrun.s16 d18, q9, #5 + + /* Store results to the output buffer */ + .irp x, d4, d6, d8, d10, d12, d14, d16, d18 + ldr TMP, [OUTPUT_BUF], #4 + add TMP, TMP, OUTPUT_COL + vst1.8 {\x}, [TMP]! + .endr + + vpop {d8-d15} +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 bx lr .unreq DCT_TABLE .unreq COEF_BLOCK .unreq OUTPUT_BUF .unreq OUTPUT_COL +<<<<<<< HEAD .unreq TMP1 .unreq TMP2 .unreq TMP3 .unreq TMP4 .endfunc +======= + .unreq TMP +.endfunc + +.purgem idct_helper + +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 /*****************************************************************************/ /* @@ -1152,12 +1319,21 @@ asm_function jsimd_idct_2x2_neon .macro do_load size .if \size == 8 +<<<<<<< HEAD vld1.8 {d4}, [U, :64]! vld1.8 {d5}, [V, :64]! vld1.8 {d0}, [Y, :64]! pld [U, #64] pld [V, #64] pld [Y, #64] +======= + vld1.8 {d4}, [U]! + vld1.8 {d5}, [V]! + vld1.8 {d0}, [Y]! + pld [Y, #64] + pld [U, #64] + pld [V, #64] +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 .elseif \size == 4 vld1.8 {d4[0]}, [U]! vld1.8 {d4[1]}, [U]! @@ -1227,11 +1403,15 @@ asm_function jsimd_idct_2x2_neon .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs +<<<<<<< HEAD /* * 2 stage pipelined YCbCr->RGB conversion */ .macro do_yuv_to_rgb_stage1 +======= +.macro do_yuv_to_rgb +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ @@ -1242,9 +1422,12 @@ asm_function jsimd_idct_2x2_neon vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ +<<<<<<< HEAD .endm .macro do_yuv_to_rgb_stage2 +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 vrshrn.s32 d20, q10, #15 vrshrn.s32 d21, q11, #15 vrshrn.s32 d24, q12, #14 @@ -1259,6 +1442,7 @@ asm_function jsimd_idct_2x2_neon vqmovun.s16 d1\b_offs, q14 .endm +<<<<<<< HEAD .macro do_yuv_to_rgb_stage2_store_load_stage1 vld1.8 {d4}, [U, :64]! vrshrn.s32 d20, q10, #15 @@ -1296,6 +1480,8 @@ asm_function jsimd_idct_2x2_neon do_yuv_to_rgb_stage2 .endm +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 /* Apple gas crashes on adrl, work around that by using adr. * But this requires a copy of these constants for each function. */ @@ -1356,6 +1542,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon /* Inner loop over pixels */ subs N, N, #8 +<<<<<<< HEAD blt 3f do_load 8 do_yuv_to_rgb_stage1 @@ -1371,6 +1558,18 @@ asm_function jsimd_ycc_\colorid\()_convert_neon tst N, #7 beq 8f 3: +======= + blt 2f +1: + do_load 8 + do_yuv_to_rgb + do_store \bpp, 8 + subs N, N, #8 + bge 1b + tst N, #7 + beq 8f +2: +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 tst N, #4 beq 3f do_load 4 @@ -1418,9 +1617,12 @@ asm_function jsimd_ycc_\colorid\()_convert_neon .endfunc .purgem do_yuv_to_rgb +<<<<<<< HEAD .purgem do_yuv_to_rgb_stage1 .purgem do_yuv_to_rgb_stage2 .purgem do_yuv_to_rgb_stage2_store_load_stage1 +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 .endm @@ -1436,6 +1638,7 @@ generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 .purgem do_store /*****************************************************************************/ +<<<<<<< HEAD /* * jsimd_extrgb_ycc_convert_neon @@ -2031,3 +2234,5 @@ asm_function jsimd_quantize_neon .unreq SHIFT .unreq LOOP_COUNT .endfunc +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c index d9bb774..f77c5ef 100644 --- a/simd/jsimd_i386.c +++ b/simd/jsimd_i386.c @@ -2,7 +2,7 @@ * jsimd_i386.c * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright 2009 D. R. Commander + * Copyright 2009-2011 D. R. Commander * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -84,6 +84,28 @@ jsimd_can_rgb_ycc (void) } GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) jsimd_can_ycc_rgb (void) { init_simd(); @@ -155,6 +177,55 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo, } GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) + { + case JCS_EXT_RGB: + sse2fct=jsimd_extrgb_gray_convert_sse2; + mmxfct=jsimd_extrgb_gray_convert_mmx; + break; + case JCS_EXT_RGBX: + sse2fct=jsimd_extrgbx_gray_convert_sse2; + mmxfct=jsimd_extrgbx_gray_convert_mmx; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_extbgr_gray_convert_sse2; + mmxfct=jsimd_extbgr_gray_convert_mmx; + break; + case JCS_EXT_BGRX: + sse2fct=jsimd_extbgrx_gray_convert_sse2; + mmxfct=jsimd_extbgrx_gray_convert_mmx; + break; + case JCS_EXT_XBGR: + sse2fct=jsimd_extxbgr_gray_convert_sse2; + mmxfct=jsimd_extxbgr_gray_convert_mmx; + break; + case JCS_EXT_XRGB: + sse2fct=jsimd_extxrgb_gray_convert_sse2; + mmxfct=jsimd_extxrgb_gray_convert_mmx; + break; + default: + sse2fct=jsimd_rgb_gray_convert_sse2; + mmxfct=jsimd_rgb_gray_convert_mmx; + break; + } + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) + sse2fct(cinfo->image_width, input_buf, + output_buf, output_row, num_rows); + else if (simd_support & JSIMD_MMX) + mmxfct(cinfo->image_width, input_buf, + output_buf, output_row, num_rows); +} + +GLOBAL(void) jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows) diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c index 7659249..2951268 100644 --- a/simd/jsimd_x86_64.c +++ b/simd/jsimd_x86_64.c @@ -2,7 +2,7 @@ * jsimd_x86_64.c * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright 2009 D. R. Commander + * Copyright 2009-2011 D. R. Commander * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -47,6 +47,23 @@ jsimd_can_rgb_ycc (void) } GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (!IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) + return 0; + + return 1; +} + +GLOBAL(int) jsimd_can_ycc_rgb (void) { /* The code is optimised for these values only */ @@ -99,6 +116,41 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo, } GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) + { + case JCS_EXT_RGB: + sse2fct=jsimd_extrgb_gray_convert_sse2; + break; + case JCS_EXT_RGBX: + sse2fct=jsimd_extrgbx_gray_convert_sse2; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_extbgr_gray_convert_sse2; + break; + case JCS_EXT_BGRX: + sse2fct=jsimd_extbgrx_gray_convert_sse2; + break; + case JCS_EXT_XBGR: + sse2fct=jsimd_extxbgr_gray_convert_sse2; + break; + case JCS_EXT_XRGB: + sse2fct=jsimd_extxrgb_gray_convert_sse2; + break; + default: + sse2fct=jsimd_rgb_gray_convert_sse2; + break; + } + + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows) diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc index c4297f9..635a931 100644 --- a/simd/jsimdext.inc +++ b/simd/jsimdext.inc @@ -38,16 +38,26 @@ ; -- segment definition -- ; +%ifdef __YASM_VER__ +%define SEG_TEXT .text align=16 +%define SEG_CONST .rdata align=16 +%else %define SEG_TEXT .text align=16 public use32 class=CODE %define SEG_CONST .rdata align=16 public use32 class=CONST +%endif %elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)-------- ; * Microsoft Visual C++ ; -- segment definition -- ; +%ifdef __YASM_VER__ +%define SEG_TEXT .text align=16 +%define SEG_CONST .rdata align=16 +%else %define SEG_TEXT .text align=16 public use64 class=CODE %define SEG_CONST .rdata align=16 public use64 class=CONST +%endif %define EXTN(name) name ; foo() -> foo %elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)---------- @@ -297,8 +307,6 @@ const_base: %ifdef WIN64 %imacro collect_args 0 - push r10 - push r11 push r12 push r13 push r14 @@ -328,8 +336,6 @@ const_base: pop r14 pop r13 pop r12 - pop r11 - pop r10 %endmacro %else |