diff options
-rw-r--r-- | Makefile.am | 2 | ||||
-rw-r--r-- | jccolor.c | 104 | ||||
-rw-r--r-- | jsimd.h | 7 | ||||
-rw-r--r-- | jsimd_none.c | 13 | ||||
-rw-r--r-- | simd/Makefile.am | 9 | ||||
-rw-r--r-- | simd/jcgrammx.asm | 113 | ||||
-rw-r--r-- | simd/jcgrass2-64.asm | 110 | ||||
-rw-r--r-- | simd/jcgrass2.asm | 110 | ||||
-rw-r--r-- | simd/jcgrymmx.asm | 359 | ||||
-rw-r--r-- | simd/jcgryss2-64.asm | 366 | ||||
-rw-r--r-- | simd/jcgryss2.asm | 385 | ||||
-rw-r--r-- | simd/jsimd.h | 67 | ||||
-rw-r--r-- | simd/jsimd_i386.c | 71 | ||||
-rw-r--r-- | simd/jsimd_x86_64.c | 52 |
14 files changed, 1676 insertions, 92 deletions
diff --git a/Makefile.am b/Makefile.am index ae5bdc3..10a7099 100644 --- a/Makefile.am +++ b/Makefile.am @@ -148,6 +148,8 @@ if WITH_SIMD else cmp $(srcdir)/testimgflt-nosimd.jpg testoutflt.jpg endif + ./cjpeg -dct int -grayscale -outfile testoutgray.jpg $(srcdir)/testorig.ppm + cmp $(srcdir)/testimggray.jpg testoutgray.jpg ./djpeg -dct int -fast -ppm -outfile testoutint.ppm $(srcdir)/testorig.jpg cmp $(srcdir)/testimgint.ppm testoutint.ppm ./djpeg -dct fast -ppm -outfile testoutfst.ppm $(srcdir)/testorig.jpg @@ -81,74 +81,6 @@ typedef my_color_converter * my_cconvert_ptr; #define TABLE_SIZE (8*(MAXJSAMPLE+1)) -#if BITS_IN_JSAMPLE == 8 - -static const unsigned char red_lut[256] = { - 0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 3 , 3 , 3 , 4 , 4 , 4 , 4 , - 5 , 5 , 5 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 8 , 8 , 8 , 9 , 9 , 9 , - 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, - 14, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, - 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 24, - 24, 24, 25, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 28, - 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33, - 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 38, 38, - 38, 39, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 42, 43, - 43, 43, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, - 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52, - 53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 56, 56, 56, 57, 57, 57, - 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 60, 61, 61, 61, 62, 62, - 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 67, - 67, 67, 68, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 71, - 72, 72, 72, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 76, 76, 76 -}; - -static const unsigned char green_lut[256] = { - 0 , 1 , 1 , 2 , 2 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , - 7 , 8 , 8 , 9 , 9 , 10 , 11 , 11 , 12 , 12 , 13 , 14 , - 14 , 15 , 15 , 16 , 16 , 17 , 18 , 18 , 19 , 19 , 20 , 21 , - 21 , 22 , 22 , 23 , 23 , 24 , 25 , 25 , 26 , 26 , 27 , 28 , - 28 , 29 , 29 , 30 , 31 , 31 , 32 , 32 , 33 , 33 , 34 , 35 , - 35 , 36 , 36 , 37 , 38 , 38 , 39 , 39 , 40 , 41 , 41 , 42 , - 42 , 43 , 43 , 44 , 45 , 45 , 46 , 46 , 47 , 48 , 48 , 49 , - 49 , 50 , 50 , 51 , 52 , 52 , 53 , 53 , 54 , 55 , 55 , 56 , - 56 , 57 , 58 , 58 , 59 , 59 , 60 , 60 , 61 , 62 , 62 , 63 , - 63 , 64 , 65 , 65 , 66 , 66 , 67 , 68 , 68 , 69 , 69 , 70 , - 70 , 71 , 72 , 72 , 73 , 73 , 74 , 75 , 75 , 76 , 76 , 77 , - 77 , 78 , 79 , 79 , 80 , 80 , 81 , 82 , 82 , 83 , 83 , 84 , - 85 , 85 , 86 , 86 , 87 , 87 , 88 , 89 , 89 , 90 , 90 , 91 , - 92 , 92 , 93 , 93 , 94 , 95 , 95 , 96 , 96 , 97 , 97 , 98 , - 99 , 99 , 100, 100, 101, 102, 102, 103, 103, 104, 104, 105, - 106, 106, 107, 107, 108, 109, 109, 110, 110, 111, 112, 112, - 113, 113, 114, 114, 115, 116, 116, 117, 117, 118, 119, 119, - 120, 120, 121, 122, 122, 123, 123, 124, 124, 125, 126, 126, - 127, 127, 128, 129, 129, 130, 130, 131, 131, 132, 133, 133, - 34, 134, 135, 136, 136, 137, 137, 138, 139, 139, 140, 140, - 141, 141, 142, 143, 143, 144, 144, 145, 146, 146, 147, 147, - 148, 149, 149, 150 -}; - -static const unsigned char blue_lut[256] = { - 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2 , 2 , - 2 , 2 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 4 , - 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , - 5 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 7 , 7 , - 7 , 7 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 9 , - 9 , 9 , 9 , 9 , 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, - 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, - 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, - 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, - 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, - 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24, - 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29 -}; - -#endif - - /* * Initialize for RGB->YCC colorspace conversion. */ @@ -259,36 +191,26 @@ rgb_gray_convert (j_compress_ptr cinfo, JDIMENSION output_row, int num_rows) { my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; - #if BITS_IN_JSAMPLE != 8 + register int r, g, b; register INT32 * ctab = cconvert->rgb_ycc_tab; - #endif register JSAMPROW inptr; register JSAMPROW outptr; - JSAMPLE *maxoutptr; register JDIMENSION col; JDIMENSION num_cols = cinfo->image_width; - int rindex = rgb_red[cinfo->in_color_space]; - int gindex = rgb_green[cinfo->in_color_space]; - int bindex = rgb_blue[cinfo->in_color_space]; - int rgbstride = rgb_pixelsize[cinfo->in_color_space]; while (--num_rows >= 0) { inptr = *input_buf++; outptr = output_buf[0][output_row]; - maxoutptr = &outptr[num_cols]; output_row++; - for (; outptr < maxoutptr; outptr++, inptr += rgbstride) { + for (col = 0; col < num_cols; col++) { + r = GETJSAMPLE(inptr[rgb_red[cinfo->in_color_space]]); + g = GETJSAMPLE(inptr[rgb_green[cinfo->in_color_space]]); + b = GETJSAMPLE(inptr[rgb_blue[cinfo->in_color_space]]); + inptr += rgb_pixelsize[cinfo->in_color_space]; /* Y */ - #if BITS_IN_JSAMPLE == 8 - *outptr = red_lut[inptr[rindex]] + green_lut[inptr[gindex]] - + blue_lut[inptr[bindex]]; - #else - *outptr = (JSAMPLE) - ((ctab[GETJSAMPLE(inptr[rindex])+R_Y_OFF] - + ctab[GETJSAMPLE(inptr[gindex])+G_Y_OFF] - + ctab[GETJSAMPLE(inptr[bindex])+B_Y_OFF]) - >> SCALEBITS); - #endif + outptr[col] = (JSAMPLE) + ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF]) + >> SCALEBITS); } } } @@ -490,8 +412,12 @@ jinit_color_converter (j_compress_ptr cinfo) cinfo->in_color_space == JCS_EXT_BGRX || cinfo->in_color_space == JCS_EXT_XBGR || cinfo->in_color_space == JCS_EXT_XRGB) { - cconvert->pub.start_pass = rgb_ycc_start; - cconvert->pub.color_convert = rgb_gray_convert; + if (jsimd_can_rgb_gray()) + cconvert->pub.color_convert = jsimd_rgb_gray_convert; + else { + cconvert->pub.start_pass = rgb_ycc_start; + cconvert->pub.color_convert = rgb_gray_convert; + } } else if (cinfo->in_color_space == JCS_YCbCr) cconvert->pub.color_convert = grayscale_convert; else @@ -13,8 +13,10 @@ #ifdef NEED_SHORT_EXTERNAL_NAMES #define jsimd_can_rgb_ycc jSCanRgbYcc +#define jsimd_can_rgb_gray jSCanRgbGry #define jsimd_can_ycc_rgb jSCanYccRgb #define jsimd_rgb_ycc_convert jSRgbYccConv +#define jsimd_rgb_gray_convert jSRgbGryConv #define jsimd_ycc_rgb_convert jSYccRgbConv #define jsimd_can_h2v2_downsample jSCanH2V2Down #define jsimd_can_h2v1_downsample jSCanH2V1Down @@ -35,12 +37,17 @@ #endif /* NEED_SHORT_EXTERNAL_NAMES */ EXTERN(int) jsimd_can_rgb_ycc JPP((void)); +EXTERN(int) jsimd_can_rgb_gray JPP((void)); EXTERN(int) jsimd_can_ycc_rgb JPP((void)); EXTERN(void) jsimd_rgb_ycc_convert JPP((j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_rgb_gray_convert + JPP((j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); EXTERN(void) jsimd_ycc_rgb_convert JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, diff --git a/jsimd_none.c b/jsimd_none.c index 7ff3074..6936a44 100644 --- a/jsimd_none.c +++ b/jsimd_none.c @@ -25,6 +25,12 @@ jsimd_can_rgb_ycc (void) } GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + return 0; +} + +GLOBAL(int) jsimd_can_ycc_rgb (void) { return 0; @@ -38,6 +44,13 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo, } GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ +} + +GLOBAL(void) jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows) diff --git a/simd/Makefile.am b/simd/Makefile.am index 81c23af..f19fdf5 100644 --- a/simd/Makefile.am +++ b/simd/Makefile.am @@ -11,7 +11,7 @@ if SIMD_X86_64 libsimd_la_SOURCES = jsimd_x86_64.c \ jsimd.h jsimdcfg.inc.h \ jsimdext.inc jcolsamp.inc jdct.inc \ - jfsseflt-64.asm \ + jfsseflt-64.asm jcgrass2-64.asm \ jccolss2-64.asm jdcolss2-64.asm \ jcsamss2-64.asm jdsamss2-64.asm jdmerss2-64.asm \ jcqnts2i-64.asm jfss2fst-64.asm jfss2int-64.asm \ @@ -20,6 +20,7 @@ libsimd_la_SOURCES = jsimd_x86_64.c \ jccolss2-64.lo: jcclrss2-64.asm jdcolss2-64.lo: jdclrss2-64.asm +jcgrass2-64.lo: jcgryss2-64.asm jdmerss2-64.lo: jdmrgss2-64.asm endif @@ -29,20 +30,22 @@ libsimd_la_SOURCES = jsimd_i386.c \ jsimd.h jsimdcfg.inc.h \ jsimdext.inc jcolsamp.inc jdct.inc \ jsimdcpu.asm \ - jccolmmx.asm jdcolmmx.asm \ + jccolmmx.asm jdcolmmx.asm jcgrammx.asm \ jcsammmx.asm jdsammmx.asm jdmermmx.asm \ jcqntmmx.asm jfmmxfst.asm jfmmxint.asm \ jimmxred.asm jimmxint.asm jimmxfst.asm \ jcqnt3dn.asm jf3dnflt.asm ji3dnflt.asm \ jcqntsse.asm jfsseflt.asm jisseflt.asm \ - jccolss2.asm jdcolss2.asm \ + jccolss2.asm jdcolss2.asm jcgrass2.asm \ jcsamss2.asm jdsamss2.asm jdmerss2.asm \ jcqnts2i.asm jfss2fst.asm jfss2int.asm \ jiss2red.asm jiss2int.asm jiss2fst.asm \ jcqnts2f.asm jiss2flt.asm jccolmmx.lo: jcclrmmx.asm +jcgrammx.lo: jcgrymmx.asm jccolss2.lo: jcclrss2.asm +jcgrass2.lo: jcgryss2.asm jdcolmmx.lo: jdclrmmx.asm jdcolss2.lo: jdclrss2.asm jdmermmx.lo: jdmrgmmx.asm diff --git a/simd/jcgrammx.asm b/simd/jcgrammx.asm new file mode 100644 index 0000000..dd46cc5 --- /dev/null +++ b/simd/jcgrammx.asm @@ -0,0 +1,113 @@ +; +; jcgrammx.asm - grayscale colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright 2011 D. R. Commander +; +; Based on +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_gray_convert_mmx) + +EXTN(jconst_rgb_gray_convert_mmx): + +PW_F0299_F0337 times 2 dw F_0_299, F_0_337 +PW_F0114_F0250 times 2 dw F_0_114, F_0_250 +PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) + + alignz 16 + +; -------------------------------------------------------------------------- +%include "jcgrymmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 3 +%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx +%include "jcgrymmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx +%include "jcgrymmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 2 +%define RGB_GREEN 1 +%define RGB_BLUE 0 +%define RGB_PIXELSIZE 3 +%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx +%include "jcgrymmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 2 +%define RGB_GREEN 1 +%define RGB_BLUE 0 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx +%include "jcgrymmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 3 +%define RGB_GREEN 2 +%define RGB_BLUE 1 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx +%include "jcgrymmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 1 +%define RGB_GREEN 2 +%define RGB_BLUE 3 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx +%include "jcgrymmx.asm" diff --git a/simd/jcgrass2-64.asm b/simd/jcgrass2-64.asm new file mode 100644 index 0000000..9f8a01a --- /dev/null +++ b/simd/jcgrass2-64.asm @@ -0,0 +1,110 @@ +; +; jcgrass2-64.asm - grayscale colorspace conversion (64-bit SSE2) +; +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; Copyright (C) 2011, D. R. Commander. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_gray_convert_sse2) + +EXTN(jconst_rgb_gray_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) + + alignz 16 + +; -------------------------------------------------------------------------- +%include "jcgryss2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 3 +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2 +%include "jcgryss2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2 +%include "jcgryss2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 2 +%define RGB_GREEN 1 +%define RGB_BLUE 0 +%define RGB_PIXELSIZE 3 +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2 +%include "jcgryss2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 2 +%define RGB_GREEN 1 +%define RGB_BLUE 0 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2 +%include "jcgryss2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 3 +%define RGB_GREEN 2 +%define RGB_BLUE 1 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2 +%include "jcgryss2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 1 +%define RGB_GREEN 2 +%define RGB_BLUE 3 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2 +%include "jcgryss2-64.asm" diff --git a/simd/jcgrass2.asm b/simd/jcgrass2.asm new file mode 100644 index 0000000..f284e0f --- /dev/null +++ b/simd/jcgrass2.asm @@ -0,0 +1,110 @@ +; +; jcgrass2.asm - grayscale colorspace conversion (SSE2) +; +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; Copyright (C) 2011, D. R. Commander. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_gray_convert_sse2) + +EXTN(jconst_rgb_gray_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) + + alignz 16 + +; -------------------------------------------------------------------------- +%include "jcgryss2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 3 +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2 +%include "jcgryss2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2 +%include "jcgryss2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 2 +%define RGB_GREEN 1 +%define RGB_BLUE 0 +%define RGB_PIXELSIZE 3 +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2 +%include "jcgryss2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 2 +%define RGB_GREEN 1 +%define RGB_BLUE 0 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2 +%include "jcgryss2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 3 +%define RGB_GREEN 2 +%define RGB_BLUE 1 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2 +%include "jcgryss2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 1 +%define RGB_GREEN 2 +%define RGB_BLUE 3 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2 +%include "jcgryss2.asm" diff --git a/simd/jcgrymmx.asm b/simd/jcgrymmx.asm new file mode 100644 index 0000000..93d0936 --- /dev/null +++ b/simd/jcgrymmx.asm @@ -0,0 +1,359 @@ +; +; jcclrmmx.asm - grayscale colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright 2011 D. R. Commander +; +; Based on +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + global EXTN(jsimd_rgb_gray_convert_mmx) + +EXTN(jsimd_rgb_gray_convert_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + pushpic eax + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_MMWORD + jae short .columnloop + alignx 16,7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + xor eax,eax + mov al, BYTE [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + xor edx,edx + mov dx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx +.column_ld4: + movd mmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd mmG, DWORD [esi+ecx] + psllq mmA, DWORD_BIT + por mmA,mmG +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + movq mmG,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + mov ecx, SIZEOF_MMWORD + jmp short .rgb_gray_cnv +.column_ld16: + test cl, 2*SIZEOF_MMWORD + mov ecx, SIZEOF_MMWORD + jz short .rgb_gray_cnv + movq mmF,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] + +.rgb_gray_cnv: + ; mmA=(00 10 20 01 11 21 02 12) + ; mmG=(22 03 13 23 04 14 24 05) + ; mmF=(15 25 06 16 26 07 17 27) + + movq mmD,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) + psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) + + punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05) + psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) + + punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16) + punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27) + + movq mmE,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) + psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) + + punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) + + punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07) + punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27) + + pxor mmH,mmH + + movq mmC,mmA + punpcklbw mmA,mmH ; mmA=(00 02 04 06) + punpckhbw mmC,mmH ; mmC=(10 12 14 16) + + movq mmB,mmE + punpcklbw mmE,mmH ; mmE=(20 22 24 26) + punpckhbw mmB,mmH ; mmB=(01 03 05 07) + + movq mmF,mmD + punpcklbw mmD,mmH ; mmD=(11 13 15 17) + punpckhbw mmF,mmH ; mmF=(21 23 25 27) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_MMWORD/8 + jz short .column_ld2 + sub ecx, byte SIZEOF_MMWORD/8 + movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_MMWORD/4 + jz short .column_ld4 + sub ecx, byte SIZEOF_MMWORD/4 + movq mmF,mmA + movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld4: + test cl, SIZEOF_MMWORD/2 + mov ecx, SIZEOF_MMWORD + jz short .rgb_gray_cnv + movq mmD,mmA + movq mmC,mmF + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] + movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] + +.rgb_gray_cnv: + ; mmA=(00 10 20 30 01 11 21 31) + ; mmF=(02 12 22 32 03 13 23 33) + ; mmD=(04 14 24 34 05 15 25 35) + ; mmC=(06 16 26 36 07 17 27 37) + + movq mmB,mmA + punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32) + punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33) + + movq mmG,mmD + punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36) + punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37) + + movq mmE,mmA + punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36) + + movq mmH,mmB + punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17) + punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37) + + pxor mmF,mmF + + movq mmC,mmA + punpcklbw mmA,mmF ; mmA=(00 02 04 06) + punpckhbw mmC,mmF ; mmC=(10 12 14 16) + + movq mmD,mmB + punpcklbw mmB,mmF ; mmB=(01 03 05 07) + punpckhbw mmD,mmF ; mmD=(11 13 15 17) + + movq mmG,mmE + punpcklbw mmE,mmF ; mmE=(20 22 24 26) + punpckhbw mmG,mmF ; mmG=(30 32 34 36) + + punpcklbw mmF,mmH + punpckhbw mmH,mmH + psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27) + psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE + ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movq mm6,mm1 + punpcklwd mm1,mm3 + punpckhwd mm6,mm3 + pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movq mm6,mm0 + punpcklwd mm0,mm2 + punpckhwd mm6,mm2 + pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movq mm0, mm5 ; mm0=BO + movq mm6, mm4 ; mm6=BE + + movq mm4,mm0 + punpcklwd mm0,mm3 + punpckhwd mm4,mm3 + pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] + + paddd mm0, mm1 + paddd mm4, mm7 + paddd mm0,mm3 + paddd mm4,mm3 + psrld mm0,SCALEBITS ; mm0=YOL + psrld mm4,SCALEBITS ; mm4=YOH + packssdw mm0,mm4 ; mm0=YO + + movq mm4,mm6 + punpcklwd mm6,mm2 + punpckhwd mm4,mm2 + pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] + + paddd mm6, MMWORD [wk(0)] + paddd mm4, MMWORD [wk(1)] + paddd mm6,mm2 + paddd mm4,mm2 + psrld mm6,SCALEBITS ; mm6=YEL + psrld mm4,SCALEBITS ; mm4=YEH + packssdw mm6,mm4 ; mm6=YE + + psllw mm0,BYTE_BIT + por mm6,mm0 ; mm6=Y + movq MMWORD [edi], mm6 ; Save Y + + sub ecx, byte SIZEOF_MMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr + add edi, byte SIZEOF_MMWORD ; outptr0 + cmp ecx, byte SIZEOF_MMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/simd/jcgryss2-64.asm b/simd/jcgryss2-64.asm new file mode 100644 index 0000000..3a52ec2 --- /dev/null +++ b/simd/jcgryss2-64.asm @@ -0,0 +1,366 @@ +; +; jcgryss2-64.asm - grayscale colorspace conversion (64-bit SSE2) +; +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; Copyright (C) 2011, D. R. Commander. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +; r10 = JDIMENSION img_width +; r11 = JSAMPARRAY input_buf +; r12 = JSAMPIMAGE output_buf +; r13 = JDIMENSION output_row +; r14 = int num_rows + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + + global EXTN(jsimd_rgb_gray_convert_sse2) + +EXTN(jsimd_rgb_gray_convert_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov rcx, r10 + test rcx,rcx + jz near .return + + push rcx + + mov rsi, r12 + mov rcx, r13 + mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax,rax + jle near .return +.rowloop: + push rdi + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr0 + + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push rax + push rdx + lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, BYTE [rsi+rcx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, WORD [rsi+rcx] + shl rax, WORD_BIT + or rax,rdx +.column_ld4: + movd xmmA,eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [rsi+rcx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [rsi+rcx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + mov rcx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH,xmmH + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF,xmmF + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [rdi], xmm6 ; Save Y + + sub rcx, byte SIZEOF_XMMWORD + add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add rdi, byte SIZEOF_XMMWORD ; outptr0 + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx,rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop + +.return: + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/simd/jcgryss2.asm b/simd/jcgryss2.asm new file mode 100644 index 0000000..6eac030 --- /dev/null +++ b/simd/jcgryss2.asm @@ -0,0 +1,385 @@ +; +; jcgryss2.asm - grayscale colorspace conversion (SSE2) +; +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; Copyright (C) 2011, D. R. Commander. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + + global EXTN(jsimd_rgb_gray_convert_sse2) + +EXTN(jsimd_rgb_gray_convert_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + pushpic eax + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + alignx 16,7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, BYTE [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx +.column_ld4: + movd xmmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH,xmmH + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF,xmmF + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [edi], xmm6 ; Save Y + + sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/simd/jsimd.h b/simd/jsimd.h index 89ac1b7..af3abbb 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -43,6 +43,14 @@ #define jsimd_extbgrx_ycc_convert_sse2 jSEXTBGRXYCCS2 #define jsimd_extxbgr_ycc_convert_sse2 jSEXTXBGRYCCS2 #define jsimd_extxrgb_ycc_convert_sse2 jSEXTXRGBYCCS2 +#define jconst_rgb_gray_convert_sse2 jSCRGBGRYS2 +#define jsimd_rgb_gray_convert_sse2 jSRGBGRYS2 +#define jsimd_extrgb_gray_convert_sse2 jSEXTRGBGRYS2 +#define jsimd_extrgbx_gray_convert_sse2 jSEXTRGBXGRYS2 +#define jsimd_extbgr_gray_convert_sse2 jSEXTBGRGRYS2 +#define jsimd_extbgrx_gray_convert_sse2 jSEXTBGRXGRYS2 +#define jsimd_extxbgr_gray_convert_sse2 jSEXTXBGRGRYS2 +#define jsimd_extxrgb_gray_convert_sse2 jSEXTXRGBGRYS2 #define jconst_ycc_rgb_convert_sse2 jSCYCCRGBS2 #define jsimd_ycc_rgb_convert_sse2 jSYCCRGBS2 #define jsimd_ycc_extrgb_convert_sse2 jSYCCEXTRGBS2 @@ -163,6 +171,35 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_mmx JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_rgb_gray_convert_mmx + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extrgb_gray_convert_mmx + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extrgbx_gray_convert_mmx + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extbgr_gray_convert_mmx + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extbgrx_gray_convert_mmx + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extxbgr_gray_convert_mmx + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extxrgb_gray_convert_mmx + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); + EXTERN(void) jsimd_ycc_rgb_convert_mmx JPP((JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, @@ -222,6 +259,36 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_sse2 JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)); +extern const int jconst_rgb_gray_convert_sse2[]; +EXTERN(void) jsimd_rgb_gray_convert_sse2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extrgb_gray_convert_sse2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extrgbx_gray_convert_sse2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extbgr_gray_convert_sse2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extbgrx_gray_convert_sse2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extxbgr_gray_convert_sse2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extxrgb_gray_convert_sse2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); + extern const int jconst_ycc_rgb_convert_sse2[]; EXTERN(void) jsimd_ycc_rgb_convert_sse2 JPP((JDIMENSION out_width, diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c index d9bb774..357d2ce 100644 --- a/simd/jsimd_i386.c +++ b/simd/jsimd_i386.c @@ -84,6 +84,28 @@ jsimd_can_rgb_ycc (void) } GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) jsimd_can_ycc_rgb (void) { init_simd(); @@ -155,6 +177,55 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo, } GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) + { + case JCS_EXT_RGB: + sse2fct=jsimd_extrgb_gray_convert_sse2; + mmxfct=jsimd_extrgb_gray_convert_mmx; + break; + case JCS_EXT_RGBX: + sse2fct=jsimd_extrgbx_gray_convert_sse2; + mmxfct=jsimd_extrgbx_gray_convert_mmx; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_extbgr_gray_convert_sse2; + mmxfct=jsimd_extbgr_gray_convert_mmx; + break; + case JCS_EXT_BGRX: + sse2fct=jsimd_extbgrx_gray_convert_sse2; + mmxfct=jsimd_extbgrx_gray_convert_mmx; + break; + case JCS_EXT_XBGR: + sse2fct=jsimd_extxbgr_gray_convert_sse2; + mmxfct=jsimd_extxbgr_gray_convert_mmx; + break; + case JCS_EXT_XRGB: + sse2fct=jsimd_extxrgb_gray_convert_sse2; + mmxfct=jsimd_extxrgb_gray_convert_mmx; + break; + default: + sse2fct=jsimd_rgb_gray_convert_sse2; + mmxfct=jsimd_rgb_gray_convert_mmx; + break; + } + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) + sse2fct(cinfo->image_width, input_buf, + output_buf, output_row, num_rows); + else if (simd_support & JSIMD_MMX) + mmxfct(cinfo->image_width, input_buf, + output_buf, output_row, num_rows); +} + +GLOBAL(void) jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows) diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c index 7659249..c89fd12 100644 --- a/simd/jsimd_x86_64.c +++ b/simd/jsimd_x86_64.c @@ -47,6 +47,23 @@ jsimd_can_rgb_ycc (void) } GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (!IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) + return 0; + + return 1; +} + +GLOBAL(int) jsimd_can_ycc_rgb (void) { /* The code is optimised for these values only */ @@ -99,6 +116,41 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo, } GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) + { + case JCS_EXT_RGB: + sse2fct=jsimd_extrgb_gray_convert_sse2; + break; + case JCS_EXT_RGBX: + sse2fct=jsimd_extrgbx_gray_convert_sse2; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_extbgr_gray_convert_sse2; + break; + case JCS_EXT_BGRX: + sse2fct=jsimd_extbgrx_gray_convert_sse2; + break; + case JCS_EXT_XBGR: + sse2fct=jsimd_extxbgr_gray_convert_sse2; + break; + case JCS_EXT_XRGB: + sse2fct=jsimd_extxrgb_gray_convert_sse2; + break; + default: + sse2fct=jsimd_rgb_gray_convert_sse2; + break; + } + + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows) |