aboutsummaryrefslogtreecommitdiff
path: root/simd
diff options
context:
space:
mode:
author dcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db> 2009-04-03 12:00:51 +0000
committer dcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db> 2009-04-03 12:00:51 +0000
commit 1a9967cd2bec4abc039aac111bfa61ef37266fe1 (patch)
tree 4cf0d62b0ee4a8f5656fdb7ded0e985da403f764 /simd
parent 803e3a287a29ba05e60a74b77f8cb41017164bd1 (diff)
Implement new colorspaces to allow directly compressing from/decompressing to RGB/RGBX/BGR/BGRX/XBGR/XRGB without conversion
git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@35 632fc199-4ca6-4c93-a231-07263d6284db
Diffstat (limited to 'simd')
-rw-r--r-- simd/Makefile.am 5
-rw-r--r-- simd/jcclrmmx.asm 474
-rw-r--r-- simd/jcclrss2.asm 500
-rw-r--r-- simd/jccolmmx.asm 523
-rw-r--r-- simd/jccolss2.asm 551
-rw-r--r-- simd/jdclrmmx.asm 402
-rw-r--r-- simd/jdclrss2.asm 500
-rw-r--r-- simd/jdcolmmx.asm 451
-rw-r--r-- simd/jdcolss2.asm 549
-rw-r--r-- simd/jsimd.h 122
10 files changed, 2275 insertions, 1802 deletions
diff --git a/simd/Makefile.am b/simd/Makefile.am
index ea53fee..2282804 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -19,6 +19,11 @@ libsimd_la_SOURCES = jsimd.h jsimdcfg.inc.h \
jiss2red.asm jiss2int.asm jiss2fst.asm \
jcqnts2f.asm jiss2flt.asm
+jccolmmx.lo: jcclrmmx.asm
+jccolss2.lo: jcclrss2.asm
+jdcolmmx.lo: jdclrmmx.asm
+jdcolss2.lo: jdclrss2.asm
+
.asm.lo:
$(LIBTOOL) --mode=compile --tag NASM ./nasm_lt.sh $(NASM) $(NAFLAGS) $< -o $@
diff --git a/simd/jcclrmmx.asm b/simd/jcclrmmx.asm
new file mode 100644
index 0000000..9cbae72
--- /dev/null
+++ b/simd/jcclrmmx.asm
@@ -0,0 +1,474 @@
+;
+; jcclrmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Convert num_rows rows of interleaved input samples (RGB_PIXELSIZE bytes per pixel) into three separate output planes, stored below as Y, Cb and Cr.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
+; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+; JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b) (b)+8 ; JDIMENSION img_width
+%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b)+20 ; JDIMENSION output_row
+%define num_rows(b) (b)+24 ; int num_rows
+
+%define original_ebp ebp+0 ; slot at the aligned frame base holding the pre-alignment stack pointer
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 8 ; number of mmword scratch slots in wk[]
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+
+ align 16
+ global EXTN(jsimd_rgb_ycc_convert_mmx)
+
+EXTN(jsimd_rgb_ycc_convert_mmx):
+ push ebp
+ mov eax,esp ; eax = esp after the push = address of the saved ebp (base for the argument macros)
+ sub esp, byte 4 ; leave at least one dword so the store below cannot overwrite the saved ebp
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp],eax ; original_ebp slot <- pre-alignment stack pointer
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; allocate the wk[] scratch area below ebp
+ pushpic eax ; make room for the GOT address (gotptr slot)
+ push ebx ; ebx, esi and edi are restored at .return
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)] ; num_cols
+ test ecx,ecx
+ jz near .return ; zero-width image: nothing to convert
+
+ push ecx ; park num_cols while ecx is reused for output_row
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] ; edi = output_buf[0] (receives Y)
+ mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] ; ebx = output_buf[1] (receives Cb)
+ mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] ; edx = output_buf[2] (receives Cr)
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW] ; advance each plane to row output_row
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx ; ecx = num_cols again
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)] ; eax = num_rows (the argument base is no longer needed)
+ test eax,eax
+ jle near .return ; num_rows <= 0: nothing to convert
+ alignx 16,7
+.rowloop:
+ pushpic eax ; presumably saves the row counter while eax carries the GOT address (pushpic is defined elsewhere)
+ push edx ; save the per-plane and input row-array cursors
+ push ebx
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ mov ebx, JSAMPROW [ebx] ; outptr1
+ mov edx, JSAMPROW [edx] ; outptr2
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jae short .columnloop ; at least one full group of SIZEOF_MMWORD columns
+ alignx 16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1: ; fewer than SIZEOF_MMWORD columns left: gather the tail from the end of the row backwards
+ push eax ; eax and edx are borrowed as scratch until .column_ld4
+ push edx
+ lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE ; is there an odd trailing byte?
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ xor eax,eax
+ mov al, BYTE [esi+ecx] ; eax = last byte, zero-extended
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ xor edx,edx
+ mov dx, WORD [esi+ecx] ; edx = preceding word, zero-extended
+ shl eax, WORD_BIT ; NOTE(review): eax is stale here if the byte step was skipped; the stale bits only reach lanes past the last valid pixel
+ or eax,edx
+.column_ld4:
+ movd mmA,eax ; mmA = trailing bytes gathered so far
+ pop edx ; restore outptr2
+ pop eax ; restore GOT address
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd mmG, DWORD [esi+ecx]
+ psllq mmA, DWORD_BIT ; make room below the bytes gathered so far
+ por mmA,mmG ; prepend the dword
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ movq mmG,mmA ; gathered tail becomes the second mmword
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] ; first mmword comes straight from the row
+ mov ecx, SIZEOF_MMWORD ; count the tail as one full group so the loop ends after it
+ jmp short .rgb_ycc_cnv
+.column_ld16:
+ test cl, 2*SIZEOF_MMWORD
+ mov ecx, SIZEOF_MMWORD ; count the tail as one full group (mov leaves the flags from test intact)
+ jz short .rgb_ycc_cnv
+ movq mmF,mmA ; gathered tail becomes the third mmword
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16,7
+
+.columnloop:
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+ ; mmA=(00 10 20 01 11 21 02 12)
+ ; mmG=(22 03 13 23 04 14 24 05)
+ ; mmF=(15 25 06 16 26 07 17 27)
+
+ movq mmD,mmA
+ psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
+ psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
+
+ punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05)
+ psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
+
+ punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16)
+ punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27)
+
+ movq mmE,mmA
+ psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
+ psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
+
+ punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
+ psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
+
+ punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07)
+ punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27)
+
+ pxor mmH,mmH ; mmH = 0, used to zero-extend bytes to words
+
+ movq mmC,mmA
+ punpcklbw mmA,mmH ; mmA=(00 02 04 06)
+ punpckhbw mmC,mmH ; mmC=(10 12 14 16)
+
+ movq mmB,mmE
+ punpcklbw mmE,mmH ; mmE=(20 22 24 26)
+ punpckhbw mmB,mmH ; mmB=(01 03 05 07)
+
+ movq mmF,mmD
+ punpcklbw mmD,mmH ; mmD=(11 13 15 17)
+ punpckhbw mmF,mmH ; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1: ; fewer than SIZEOF_MMWORD columns left: gather the tail from the end of the row backwards
+ test cl, SIZEOF_MMWORD/8
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_MMWORD/8
+ movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] ; load the last remaining pixel
+.column_ld2:
+ test cl, SIZEOF_MMWORD/4
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_MMWORD/4
+ movq mmF,mmA ; what was gathered so far moves up one register
+ movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+ test cl, SIZEOF_MMWORD/2
+ mov ecx, SIZEOF_MMWORD ; count the tail as one full group (mov leaves the flags from test intact)
+ jz short .rgb_ycc_cnv
+ movq mmD,mmA ; what was gathered so far moves up two registers
+ movq mmC,mmF
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16,7
+
+.columnloop:
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+ movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+ ; mmA=(00 10 20 30 01 11 21 31)
+ ; mmF=(02 12 22 32 03 13 23 33)
+ ; mmD=(04 14 24 34 05 15 25 35)
+ ; mmC=(06 16 26 36 07 17 27 37)
+
+ movq mmB,mmA
+ punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32)
+ punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33)
+
+ movq mmG,mmD
+ punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36)
+ punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37)
+
+ movq mmE,mmA
+ punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
+ punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36)
+
+ movq mmH,mmB
+ punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17)
+ punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37)
+
+ pxor mmF,mmF ; mmF = 0, used to zero-extend bytes to words
+
+ movq mmC,mmA
+ punpcklbw mmA,mmF ; mmA=(00 02 04 06)
+ punpckhbw mmC,mmF ; mmC=(10 12 14 16)
+
+ movq mmD,mmB
+ punpcklbw mmB,mmF ; mmB=(01 03 05 07)
+ punpckhbw mmD,mmF ; mmD=(11 13 15 17)
+
+ movq mmG,mmE
+ punpcklbw mmE,mmF ; mmE=(20 22 24 26)
+ punpckhbw mmG,mmF ; mmG=(30 32 34 36)
+
+ punpcklbw mmF,mmH ; bytes land in the high byte of each word (mmF was zero) ...
+ punpckhbw mmH,mmH ; ... and here each byte is doubled into a word
+ psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27)
+ psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+ ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ movq MMWORD [wk(0)], mm0 ; wk(0)=RE
+ movq MMWORD [wk(1)], mm1 ; wk(1)=RO
+ movq MMWORD [wk(2)], mm4 ; wk(2)=BE
+ movq MMWORD [wk(3)], mm5 ; wk(3)=BO
+
+ movq mm6,mm1
+ punpcklwd mm1,mm3 ; mm1=(RO,GO) word pairs, low half
+ punpckhwd mm6,mm3 ; mm6=(RO,GO) word pairs, high half
+ movq mm7,mm1 ; copies of the pairs for the Cb products
+ movq mm4,mm6
+ pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor mm1,mm1
+ pxor mm6,mm6
+ punpcklwd mm1,mm5 ; mm1=BOL (in the high word of each dword)
+ punpckhwd mm6,mm5 ; mm6=BOH (in the high word of each dword)
+ psrld mm1,1 ; mm1=BOL*FIX(0.500)
+ psrld mm6,1 ; mm6=BOH*FIX(0.500)
+
+ movq mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
+
+ paddd mm7,mm1
+ paddd mm4,mm6
+ paddd mm7,mm5
+ paddd mm4,mm5
+ psrld mm7,SCALEBITS ; mm7=CbOL
+ psrld mm4,SCALEBITS ; mm4=CbOH
+ packssdw mm7,mm4 ; mm7=CbO
+
+ movq mm1, MMWORD [wk(2)] ; mm1=BE
+
+ movq mm6,mm0
+ punpcklwd mm0,mm2 ; mm0=(RE,GE) word pairs, low half
+ punpckhwd mm6,mm2 ; mm6=(RE,GE) word pairs, high half
+ movq mm5,mm0 ; copies of the pairs for the Cb products
+ movq mm4,mm6
+ pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor mm0,mm0
+ pxor mm6,mm6
+ punpcklwd mm0,mm1 ; mm0=BEL (in the high word of each dword)
+ punpckhwd mm6,mm1 ; mm6=BEH (in the high word of each dword)
+ psrld mm0,1 ; mm0=BEL*FIX(0.500)
+ psrld mm6,1 ; mm6=BEH*FIX(0.500)
+
+ movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+ paddd mm5,mm0
+ paddd mm4,mm6
+ paddd mm5,mm1
+ paddd mm4,mm1
+ psrld mm5,SCALEBITS ; mm5=CbEL
+ psrld mm4,SCALEBITS ; mm4=CbEH
+ packssdw mm5,mm4 ; mm5=CbE
+
+ psllw mm7,BYTE_BIT ; odd-column results move to the high byte of each word
+ por mm5,mm7 ; mm5=Cb
+ movq MMWORD [ebx], mm5 ; Save Cb
+
+ movq mm0, MMWORD [wk(3)] ; mm0=BO
+ movq mm6, MMWORD [wk(2)] ; mm6=BE
+ movq mm1, MMWORD [wk(1)] ; mm1=RO
+
+ movq mm4,mm0
+ punpcklwd mm0,mm3 ; mm0=(BO,GO) word pairs, low half
+ punpckhwd mm4,mm3 ; mm4=(BO,GO) word pairs, high half
+ movq mm7,mm0 ; copies of the pairs for the Cr products
+ movq mm5,mm4
+ pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
+
+ paddd mm0, MMWORD [wk(4)] ; add the R/G partial sums saved earlier
+ paddd mm4, MMWORD [wk(5)]
+ paddd mm0,mm3
+ paddd mm4,mm3
+ psrld mm0,SCALEBITS ; mm0=YOL
+ psrld mm4,SCALEBITS ; mm4=YOH
+ packssdw mm0,mm4 ; mm0=YO
+
+ pxor mm3,mm3
+ pxor mm4,mm4
+ punpcklwd mm3,mm1 ; mm3=ROL (in the high word of each dword)
+ punpckhwd mm4,mm1 ; mm4=ROH (in the high word of each dword)
+ psrld mm3,1 ; mm3=ROL*FIX(0.500)
+ psrld mm4,1 ; mm4=ROH*FIX(0.500)
+
+ movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+ paddd mm7,mm3
+ paddd mm5,mm4
+ paddd mm7,mm1
+ paddd mm5,mm1
+ psrld mm7,SCALEBITS ; mm7=CrOL
+ psrld mm5,SCALEBITS ; mm5=CrOH
+ packssdw mm7,mm5 ; mm7=CrO
+
+ movq mm3, MMWORD [wk(0)] ; mm3=RE
+
+ movq mm4,mm6
+ punpcklwd mm6,mm2 ; mm6=(BE,GE) word pairs, low half
+ punpckhwd mm4,mm2 ; mm4=(BE,GE) word pairs, high half
+ movq mm1,mm6 ; copies of the pairs for the Cr products
+ movq mm5,mm4
+ pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
+
+ paddd mm6, MMWORD [wk(6)] ; add the R/G partial sums saved earlier
+ paddd mm4, MMWORD [wk(7)]
+ paddd mm6,mm2
+ paddd mm4,mm2
+ psrld mm6,SCALEBITS ; mm6=YEL
+ psrld mm4,SCALEBITS ; mm4=YEH
+ packssdw mm6,mm4 ; mm6=YE
+
+ psllw mm0,BYTE_BIT ; odd-column results move to the high byte of each word
+ por mm6,mm0 ; mm6=Y
+ movq MMWORD [edi], mm6 ; Save Y
+
+ pxor mm2,mm2
+ pxor mm4,mm4
+ punpcklwd mm2,mm3 ; mm2=REL (in the high word of each dword)
+ punpckhwd mm4,mm3 ; mm4=REH (in the high word of each dword)
+ psrld mm2,1 ; mm2=REL*FIX(0.500)
+ psrld mm4,1 ; mm4=REH*FIX(0.500)
+
+ movq mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
+
+ paddd mm1,mm2
+ paddd mm5,mm4
+ paddd mm1,mm0
+ paddd mm5,mm0
+ psrld mm1,SCALEBITS ; mm1=CrEL
+ psrld mm5,SCALEBITS ; mm5=CrEH
+ packssdw mm1,mm5 ; mm1=CrE
+
+ psllw mm7,BYTE_BIT ; odd-column results move to the high byte of each word
+ por mm1,mm7 ; mm1=Cr
+ movq MMWORD [edx], mm1 ; Save Cr
+
+ sub ecx, byte SIZEOF_MMWORD ; columns still to convert in this row
+ add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
+ add edi, byte SIZEOF_MMWORD ; outptr0
+ add ebx, byte SIZEOF_MMWORD ; outptr1
+ add edx, byte SIZEOF_MMWORD ; outptr2
+ cmp ecx, byte SIZEOF_MMWORD
+ jae near .columnloop ; another full group remains
+ test ecx,ecx
+ jnz near .column_ld1 ; a partial group remains
+
+ pop ecx ; col
+ pop esi ; restore the row-array cursors saved at .rowloop
+ pop edi
+ pop ebx
+ pop edx
+ poppic eax ; counterpart of the pushpic at .rowloop
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW ; next row of each output plane
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
diff --git a/simd/jcclrss2.asm b/simd/jcclrss2.asm
new file mode 100644
index 0000000..10b11f9
--- /dev/null
+++ b/simd/jcclrss2.asm
@@ -0,0 +1,500 @@
+;
+; jcclrss2.asm - colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Convert num_rows rows of interleaved input samples (RGB_PIXELSIZE bytes per pixel) into three separate output planes, stored below as Y, Cb and Cr.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
+; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+; JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b) (b)+8 ; JDIMENSION img_width
+%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b)+20 ; JDIMENSION output_row
+%define num_rows(b) (b)+24 ; int num_rows
+
+%define original_ebp ebp+0 ; slot at the aligned frame base holding the pre-alignment stack pointer
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 8 ; number of xmmword scratch slots in wk[]
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+
+ align 16
+
+ global EXTN(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+ push ebp
+ mov eax,esp ; eax = esp after the push = address of the saved ebp (base for the argument macros)
+ sub esp, byte 4 ; leave at least one dword so the store below cannot overwrite the saved ebp
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax ; original_ebp slot <- pre-alignment stack pointer
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; allocate the wk[] scratch area below ebp
+ pushpic eax ; make room for the GOT address (gotptr slot)
+ push ebx ; ebx, esi and edi are restored at .return
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)] ; num_cols
+ test ecx,ecx
+ jz near .return ; zero-width image: nothing to convert
+
+ push ecx ; park num_cols while ecx is reused for output_row
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] ; edi = output_buf[0] (receives Y)
+ mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] ; ebx = output_buf[1] (receives Cb)
+ mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] ; edx = output_buf[2] (receives Cr)
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW] ; advance each plane to row output_row
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx ; ecx = num_cols again
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)] ; eax = num_rows (the argument base is no longer needed)
+ test eax,eax
+ jle near .return ; num_rows <= 0: nothing to convert
+ alignx 16,7
+.rowloop:
+ pushpic eax ; presumably saves the row counter while eax carries the GOT address (pushpic is defined elsewhere)
+ push edx ; save the per-plane and input row-array cursors
+ push ebx
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ mov ebx, JSAMPROW [ebx] ; outptr1
+ mov edx, JSAMPROW [edx] ; outptr2
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop ; at least one full group of SIZEOF_XMMWORD columns
+ alignx 16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1: ; fewer than SIZEOF_XMMWORD columns left: gather the tail from the end of the row backwards
+ push eax ; eax and edx are borrowed as scratch until .column_ld4
+ push edx
+ lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE ; is there an odd trailing byte?
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, BYTE [esi+ecx] ; eax = last byte, zero-extended
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, WORD [esi+ecx] ; edx = preceding word, zero-extended
+ shl eax, WORD_BIT ; NOTE(review): eax is stale here if the byte step was skipped; the stale bits only reach lanes past the last valid pixel
+ or eax,edx
+.column_ld4:
+ movd xmmA,eax ; xmmA = trailing bytes gathered so far
+ pop edx ; restore outptr2
+ pop eax ; restore GOT address
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_DWORD ; make room below the bytes gathered so far
+ por xmmA,xmmF ; prepend the dword
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_MMWORD ; make room below the bytes gathered so far
+ por xmmA,xmmB ; prepend the mmword
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF,xmmA ; gathered tail becomes the second xmmword
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] ; first xmmword comes straight from the row
+ mov ecx, SIZEOF_XMMWORD ; count the tail as one full group so the loop ends after it
+ jmp short .rgb_ycc_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD
+ mov ecx, SIZEOF_XMMWORD ; count the tail as one full group (mov leaves the flags from test intact)
+ jz short .rgb_ycc_cnv
+ movdqa xmmB,xmmA ; gathered tail becomes the third xmmword
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16,7
+
+.columnloop:
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG,xmmA
+ pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD,xmmA
+ pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE,xmmA
+ pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH,xmmH ; xmmH = 0, used to zero-extend bytes to words
+
+ movdqa xmmC,xmmA
+ punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB,xmmE
+ punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF,xmmD
+ punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1: ; fewer than SIZEOF_XMMWORD columns left: gather the tail from the end of the row backwards
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] ; load the last remaining pixel
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD ; make room below what was gathered so far
+ por xmmA,xmmE ; prepend the mmword
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE,xmmA ; what was gathered so far moves up one register
+ movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ mov ecx, SIZEOF_XMMWORD ; count the tail as one full group (mov leaves the flags from test intact)
+ jz short .rgb_ycc_cnv
+ movdqa xmmF,xmmA ; what was gathered so far moves up two registers
+ movdqa xmmH,xmmE
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16,7
+
+.columnloop:
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD,xmmA
+ punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC,xmmF
+ punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB,xmmA
+ punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG,xmmD
+ punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE,xmmA
+ punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH,xmmB
+ punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF,xmmF ; xmmF = 0, used to zero-extend bytes to words
+
+ movdqa xmmC,xmmA
+ punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD,xmmB
+ punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG,xmmE
+ punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF,xmmH ; bytes land in the high byte of each word (xmmF was zero) ...
+ punpckhbw xmmH,xmmH ; ... and here each byte is doubled into a word
+ psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+ movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+
+ movdqa xmm6,xmm1
+ punpcklwd xmm1,xmm3 ; xmm1=(RO,GO) word pairs, low half
+ punpckhwd xmm6,xmm3 ; xmm6=(RO,GO) word pairs, high half
+ movdqa xmm7,xmm1 ; copies of the pairs for the Cb products
+ movdqa xmm4,xmm6
+ pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor xmm1,xmm1
+ pxor xmm6,xmm6
+ punpcklwd xmm1,xmm5 ; xmm1=BOL (in the high word of each dword)
+ punpckhwd xmm6,xmm5 ; xmm6=BOH (in the high word of each dword)
+ psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
+ psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
+
+ movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7,xmm1
+ paddd xmm4,xmm6
+ paddd xmm7,xmm5
+ paddd xmm4,xmm5
+ psrld xmm7,SCALEBITS ; xmm7=CbOL
+ psrld xmm4,SCALEBITS ; xmm4=CbOH
+ packssdw xmm7,xmm4 ; xmm7=CbO
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
+
+ movdqa xmm6,xmm0
+ punpcklwd xmm0,xmm2 ; xmm0=(RE,GE) word pairs, low half
+ punpckhwd xmm6,xmm2 ; xmm6=(RE,GE) word pairs, high half
+ movdqa xmm5,xmm0 ; copies of the pairs for the Cb products
+ movdqa xmm4,xmm6
+ pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor xmm0,xmm0
+ pxor xmm6,xmm6
+ punpcklwd xmm0,xmm1 ; xmm0=BEL (in the high word of each dword)
+ punpckhwd xmm6,xmm1 ; xmm6=BEH (in the high word of each dword)
+ psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
+ psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
+
+ movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm5,xmm0
+ paddd xmm4,xmm6
+ paddd xmm5,xmm1
+ paddd xmm4,xmm1
+ psrld xmm5,SCALEBITS ; xmm5=CbEL
+ psrld xmm4,SCALEBITS ; xmm4=CbEH
+ packssdw xmm5,xmm4 ; xmm5=CbE
+
+ psllw xmm7,BYTE_BIT ; odd-column results move to the high byte of each word
+ por xmm5,xmm7 ; xmm5=Cb
+ movdqa XMMWORD [ebx], xmm5 ; Save Cb
+
+ movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+ movdqa xmm4,xmm0
+ punpcklwd xmm0,xmm3 ; xmm0=(BO,GO) word pairs, low half
+ punpckhwd xmm4,xmm3 ; xmm4=(BO,GO) word pairs, high half
+ movdqa xmm7,xmm0 ; copies of the pairs for the Cr products
+ movdqa xmm5,xmm4
+ pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, XMMWORD [wk(4)] ; add the R/G partial sums saved earlier
+ paddd xmm4, XMMWORD [wk(5)]
+ paddd xmm0,xmm3
+ paddd xmm4,xmm3
+ psrld xmm0,SCALEBITS ; xmm0=YOL
+ psrld xmm4,SCALEBITS ; xmm4=YOH
+ packssdw xmm0,xmm4 ; xmm0=YO
+
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ punpcklwd xmm3,xmm1 ; xmm3=ROL (in the high word of each dword)
+ punpckhwd xmm4,xmm1 ; xmm4=ROH (in the high word of each dword)
+ psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
+ psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
+
+ movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7,xmm3
+ paddd xmm5,xmm4
+ paddd xmm7,xmm1
+ paddd xmm5,xmm1
+ psrld xmm7,SCALEBITS ; xmm7=CrOL
+ psrld xmm5,SCALEBITS ; xmm5=CrOH
+ packssdw xmm7,xmm5 ; xmm7=CrO
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
+
+ movdqa xmm4,xmm6
+ punpcklwd xmm6,xmm2 ; xmm6=(BE,GE) word pairs, low half
+ punpckhwd xmm4,xmm2 ; xmm4=(BE,GE) word pairs, high half
+ movdqa xmm1,xmm6 ; copies of the pairs for the Cr products
+ movdqa xmm5,xmm4
+ pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(6)] ; add the R/G partial sums saved earlier
+ paddd xmm4, XMMWORD [wk(7)]
+ paddd xmm6,xmm2
+ paddd xmm4,xmm2
+ psrld xmm6,SCALEBITS ; xmm6=YEL
+ psrld xmm4,SCALEBITS ; xmm4=YEH
+ packssdw xmm6,xmm4 ; xmm6=YE
+
+ psllw xmm0,BYTE_BIT ; odd-column results move to the high byte of each word
+ por xmm6,xmm0 ; xmm6=Y
+ movdqa XMMWORD [edi], xmm6 ; Save Y
+
+ pxor xmm2,xmm2
+ pxor xmm4,xmm4
+ punpcklwd xmm2,xmm3 ; xmm2=REL (in the high word of each dword)
+ punpckhwd xmm4,xmm3 ; xmm4=REH (in the high word of each dword)
+ psrld xmm2,1 ; xmm2=REL*FIX(0.500)
+ psrld xmm4,1 ; xmm4=REH*FIX(0.500)
+
+ movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
+
+ paddd xmm1,xmm2
+ paddd xmm5,xmm4
+ paddd xmm1,xmm0
+ paddd xmm5,xmm0
+ psrld xmm1,SCALEBITS ; xmm1=CrEL
+ psrld xmm5,SCALEBITS ; xmm5=CrEH
+ packssdw xmm1,xmm5 ; xmm1=CrE
+
+ psllw xmm7,BYTE_BIT ; odd-column results move to the high byte of each word
+ por xmm1,xmm7 ; xmm1=Cr
+ movdqa XMMWORD [edx], xmm1 ; Save Cr
+
+ sub ecx, byte SIZEOF_XMMWORD ; columns still to convert in this row
+ add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add edi, byte SIZEOF_XMMWORD ; outptr0
+ add ebx, byte SIZEOF_XMMWORD ; outptr1
+ add edx, byte SIZEOF_XMMWORD ; outptr2
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop ; another full group remains
+ test ecx,ecx
+ jnz near .column_ld1 ; a partial group remains
+
+ pop ecx ; col
+ pop esi ; restore the row-array cursors saved at .rowloop
+ pop edi
+ pop ebx
+ pop edx
+ poppic eax ; counterpart of the pushpic at .rowloop
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW ; next row of each output plane
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
diff --git a/simd/jccolmmx.asm b/simd/jccolmmx.asm
index 340af00..8352fc3 100644
--- a/simd/jccolmmx.asm
+++ b/simd/jccolmmx.asm
@@ -2,6 +2,7 @@
; jccolmmx.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
@@ -51,458 +52,70 @@ PD_ONEHALF times 2 dd (1 << (SCALEBITS-1))
alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
-; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-; JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b) (b)+8 ; JDIMENSION img_width
-%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
-%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
-%define output_row(b) (b)+20 ; JDIMENSION output_row
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 8
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_rgb_ycc_convert_mmx)
-
-EXTN(jsimd_rgb_ycc_convert_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make a room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [img_width(eax)] ; num_cols
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov esi, JSAMPIMAGE [output_buf(eax)]
- mov ecx, JDIMENSION [output_row(eax)]
- mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
- lea edi, [edi+ecx*SIZEOF_JSAMPROW]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov esi, JSAMPARRAY [input_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- pushpic eax
- push edx
- push ebx
- push edi
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr0
- mov ebx, JSAMPROW [ebx] ; outptr1
- mov edx, JSAMPROW [edx] ; outptr2
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- cmp ecx, byte SIZEOF_MMWORD
- jae short .columnloop
- alignx 16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
- push eax
- push edx
- lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub ecx, byte SIZEOF_BYTE
- xor eax,eax
- mov al, BYTE [esi+ecx]
-.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub ecx, byte SIZEOF_WORD
- xor edx,edx
- mov dx, WORD [esi+ecx]
- shl eax, WORD_BIT
- or eax,edx
-.column_ld4:
- movd mmA,eax
- pop edx
- pop eax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub ecx, byte SIZEOF_DWORD
- movd mmG, DWORD [esi+ecx]
- psllq mmA, DWORD_BIT
- por mmA,mmG
-.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- movq mmG,mmA
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- mov ecx, SIZEOF_MMWORD
- jmp short .rgb_ycc_cnv
-.column_ld16:
- test cl, 2*SIZEOF_MMWORD
- mov ecx, SIZEOF_MMWORD
- jz short .rgb_ycc_cnv
- movq mmF,mmA
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16,7
-
-.columnloop:
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
-
-.rgb_ycc_cnv:
- ; mmA=(00 10 20 01 11 21 02 12)
- ; mmG=(22 03 13 23 04 14 24 05)
- ; mmF=(15 25 06 16 26 07 17 27)
-
- movq mmD,mmA
- psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
- psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
-
- punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05)
- psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
-
- punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16)
- punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27)
-
- movq mmE,mmA
- psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
- psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
-
- punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
- psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
-
- punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07)
- punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27)
-
- pxor mmH,mmH
-
- movq mmC,mmA
- punpcklbw mmA,mmH ; mmA=(00 02 04 06)
- punpckhbw mmC,mmH ; mmC=(10 12 14 16)
-
- movq mmB,mmE
- punpcklbw mmE,mmH ; mmE=(20 22 24 26)
- punpckhbw mmB,mmH ; mmB=(01 03 05 07)
-
- movq mmF,mmD
- punpcklbw mmD,mmH ; mmD=(11 13 15 17)
- punpckhbw mmF,mmH ; mmF=(21 23 25 27)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
- test cl, SIZEOF_MMWORD/8
- jz short .column_ld2
- sub ecx, byte SIZEOF_MMWORD/8
- movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
- test cl, SIZEOF_MMWORD/4
- jz short .column_ld4
- sub ecx, byte SIZEOF_MMWORD/4
- movq mmF,mmA
- movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld4:
- test cl, SIZEOF_MMWORD/2
- mov ecx, SIZEOF_MMWORD
- jz short .rgb_ycc_cnv
- movq mmD,mmA
- movq mmC,mmF
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16,7
-
-.columnloop:
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
- movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
- movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
-
-.rgb_ycc_cnv:
- ; mmA=(00 10 20 30 01 11 21 31)
- ; mmF=(02 12 22 32 03 13 23 33)
- ; mmD=(04 14 24 34 05 15 25 35)
- ; mmC=(06 16 26 36 07 17 27 37)
-
- movq mmB,mmA
- punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32)
- punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33)
-
- movq mmG,mmD
- punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36)
- punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37)
-
- movq mmE,mmA
- punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
- punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36)
-
- movq mmH,mmB
- punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17)
- punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37)
-
- pxor mmF,mmF
-
- movq mmC,mmA
- punpcklbw mmA,mmF ; mmA=(00 02 04 06)
- punpckhbw mmC,mmF ; mmC=(10 12 14 16)
-
- movq mmD,mmB
- punpcklbw mmB,mmF ; mmB=(01 03 05 07)
- punpckhbw mmD,mmF ; mmD=(11 13 15 17)
-
- movq mmG,mmE
- punpcklbw mmE,mmF ; mmE=(20 22 24 26)
- punpckhbw mmG,mmF ; mmG=(30 32 34 36)
-
- punpcklbw mmF,mmH
- punpckhbw mmH,mmH
- psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27)
- psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
- ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
- movq MMWORD [wk(0)], mm0 ; wk(0)=RE
- movq MMWORD [wk(1)], mm1 ; wk(1)=RO
- movq MMWORD [wk(2)], mm4 ; wk(2)=BE
- movq MMWORD [wk(3)], mm5 ; wk(3)=BO
-
- movq mm6,mm1
- punpcklwd mm1,mm3
- punpckhwd mm6,mm3
- movq mm7,mm1
- movq mm4,mm6
- pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
- pmaddwd mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
- pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
- movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
- movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- pxor mm1,mm1
- pxor mm6,mm6
- punpcklwd mm1,mm5 ; mm1=BOL
- punpckhwd mm6,mm5 ; mm6=BOH
- psrld mm1,1 ; mm1=BOL*FIX(0.500)
- psrld mm6,1 ; mm6=BOH*FIX(0.500)
-
- movq mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
-
- paddd mm7,mm1
- paddd mm4,mm6
- paddd mm7,mm5
- paddd mm4,mm5
- psrld mm7,SCALEBITS ; mm7=CbOL
- psrld mm4,SCALEBITS ; mm4=CbOH
- packssdw mm7,mm4 ; mm7=CbO
-
- movq mm1, MMWORD [wk(2)] ; mm1=BE
-
- movq mm6,mm0
- punpcklwd mm0,mm2
- punpckhwd mm6,mm2
- movq mm5,mm0
- movq mm4,mm6
- pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
- pmaddwd mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
- pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
- movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
- movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- pxor mm0,mm0
- pxor mm6,mm6
- punpcklwd mm0,mm1 ; mm0=BEL
- punpckhwd mm6,mm1 ; mm6=BEH
- psrld mm0,1 ; mm0=BEL*FIX(0.500)
- psrld mm6,1 ; mm6=BEH*FIX(0.500)
-
- movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
-
- paddd mm5,mm0
- paddd mm4,mm6
- paddd mm5,mm1
- paddd mm4,mm1
- psrld mm5,SCALEBITS ; mm5=CbEL
- psrld mm4,SCALEBITS ; mm4=CbEH
- packssdw mm5,mm4 ; mm5=CbE
-
- psllw mm7,BYTE_BIT
- por mm5,mm7 ; mm5=Cb
- movq MMWORD [ebx], mm5 ; Save Cb
-
- movq mm0, MMWORD [wk(3)] ; mm0=BO
- movq mm6, MMWORD [wk(2)] ; mm6=BE
- movq mm1, MMWORD [wk(1)] ; mm1=RO
-
- movq mm4,mm0
- punpcklwd mm0,mm3
- punpckhwd mm4,mm3
- movq mm7,mm0
- movq mm5,mm4
- pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
- pmaddwd mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
- pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
- movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
-
- paddd mm0, MMWORD [wk(4)]
- paddd mm4, MMWORD [wk(5)]
- paddd mm0,mm3
- paddd mm4,mm3
- psrld mm0,SCALEBITS ; mm0=YOL
- psrld mm4,SCALEBITS ; mm4=YOH
- packssdw mm0,mm4 ; mm0=YO
-
- pxor mm3,mm3
- pxor mm4,mm4
- punpcklwd mm3,mm1 ; mm3=ROL
- punpckhwd mm4,mm1 ; mm4=ROH
- psrld mm3,1 ; mm3=ROL*FIX(0.500)
- psrld mm4,1 ; mm4=ROH*FIX(0.500)
-
- movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
-
- paddd mm7,mm3
- paddd mm5,mm4
- paddd mm7,mm1
- paddd mm5,mm1
- psrld mm7,SCALEBITS ; mm7=CrOL
- psrld mm5,SCALEBITS ; mm5=CrOH
- packssdw mm7,mm5 ; mm7=CrO
-
- movq mm3, MMWORD [wk(0)] ; mm3=RE
-
- movq mm4,mm6
- punpcklwd mm6,mm2
- punpckhwd mm4,mm2
- movq mm1,mm6
- movq mm5,mm4
- pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
- pmaddwd mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
- pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
- movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
-
- paddd mm6, MMWORD [wk(6)]
- paddd mm4, MMWORD [wk(7)]
- paddd mm6,mm2
- paddd mm4,mm2
- psrld mm6,SCALEBITS ; mm6=YEL
- psrld mm4,SCALEBITS ; mm4=YEH
- packssdw mm6,mm4 ; mm6=YE
-
- psllw mm0,BYTE_BIT
- por mm6,mm0 ; mm6=Y
- movq MMWORD [edi], mm6 ; Save Y
-
- pxor mm2,mm2
- pxor mm4,mm4
- punpcklwd mm2,mm3 ; mm2=REL
- punpckhwd mm4,mm3 ; mm4=REH
- psrld mm2,1 ; mm2=REL*FIX(0.500)
- psrld mm4,1 ; mm4=REH*FIX(0.500)
-
- movq mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
-
- paddd mm1,mm2
- paddd mm5,mm4
- paddd mm1,mm0
- paddd mm5,mm0
- psrld mm1,SCALEBITS ; mm1=CrEL
- psrld mm5,SCALEBITS ; mm5=CrEH
- packssdw mm1,mm5 ; mm1=CrE
-
- psllw mm7,BYTE_BIT
- por mm1,mm7 ; mm1=Cr
- movq MMWORD [edx], mm1 ; Save Cr
-
- sub ecx, byte SIZEOF_MMWORD
- add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
- add edi, byte SIZEOF_MMWORD ; outptr0
- add ebx, byte SIZEOF_MMWORD ; outptr1
- add edx, byte SIZEOF_MMWORD ; outptr2
- cmp ecx, byte SIZEOF_MMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .column_ld1
-
- pop ecx ; col
- pop esi
- pop edi
- pop ebx
- pop edx
- poppic eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_buf
- add edi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- dec eax ; num_rows
- jg near .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0	; presumably the byte offset of R within a pixel (R,G,B order) - confirm in jcclrmmx.asm
+%define RGB_GREEN 1	; presumably the byte offset of G within a pixel
+%define RGB_BLUE 2	; presumably the byte offset of B within a pixel
+%define RGB_PIXELSIZE 3	; bytes per input pixel; all three bytes are assigned to a component
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx	; presumably renames the entry point emitted by the %include below
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0	; presumably the byte offset of R within a pixel (R,G,B,X order) - confirm in jcclrmmx.asm
+%define RGB_GREEN 1	; presumably the byte offset of G within a pixel
+%define RGB_BLUE 2	; presumably the byte offset of B within a pixel
+%define RGB_PIXELSIZE 4	; bytes per input pixel; byte 3 is not assigned to any component
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx	; presumably renames the entry point emitted by the %include below
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2	; presumably the byte offset of R within a pixel (B,G,R order) - confirm in jcclrmmx.asm
+%define RGB_GREEN 1	; presumably the byte offset of G within a pixel
+%define RGB_BLUE 0	; presumably the byte offset of B within a pixel
+%define RGB_PIXELSIZE 3	; bytes per input pixel; all three bytes are assigned to a component
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx	; presumably renames the entry point emitted by the %include below
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2	; presumably the byte offset of R within a pixel (B,G,R,X order) - confirm in jcclrmmx.asm
+%define RGB_GREEN 1	; presumably the byte offset of G within a pixel
+%define RGB_BLUE 0	; presumably the byte offset of B within a pixel
+%define RGB_PIXELSIZE 4	; bytes per input pixel; byte 3 is not assigned to any component
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx	; presumably renames the entry point emitted by the %include below
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3	; presumably the byte offset of R within a pixel (X,B,G,R order) - confirm in jcclrmmx.asm
+%define RGB_GREEN 2	; presumably the byte offset of G within a pixel
+%define RGB_BLUE 1	; presumably the byte offset of B within a pixel
+%define RGB_PIXELSIZE 4	; bytes per input pixel; byte 0 is not assigned to any component
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx	; presumably renames the entry point emitted by the %include below
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1	; presumably the byte offset of R within a pixel (X,R,G,B order) - confirm in jcclrmmx.asm
+%define RGB_GREEN 2	; presumably the byte offset of G within a pixel
+%define RGB_BLUE 3	; presumably the byte offset of B within a pixel
+%define RGB_PIXELSIZE 4	; bytes per input pixel; byte 0 is not assigned to any component
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx	; presumably renames the entry point emitted by the %include below
+%include "jcclrmmx.asm"
diff --git a/simd/jccolss2.asm b/simd/jccolss2.asm
index 4d06bb8..2d7bb57 100644
--- a/simd/jccolss2.asm
+++ b/simd/jccolss2.asm
@@ -3,6 +3,7 @@
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
@@ -48,486 +49,70 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
-; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-; JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b) (b)+8 ; JDIMENSION img_width
-%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
-%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
-%define output_row(b) (b)+20 ; JDIMENSION output_row
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 8
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_rgb_ycc_convert_sse2)
-
-EXTN(jsimd_rgb_ycc_convert_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make a room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [img_width(eax)]
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov esi, JSAMPIMAGE [output_buf(eax)]
- mov ecx, JDIMENSION [output_row(eax)]
- mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
- lea edi, [edi+ecx*SIZEOF_JSAMPROW]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov esi, JSAMPARRAY [input_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- pushpic eax
- push edx
- push ebx
- push edi
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr0
- mov ebx, JSAMPROW [ebx] ; outptr1
- mov edx, JSAMPROW [edx] ; outptr2
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- alignx 16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
- push eax
- push edx
- lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub ecx, byte SIZEOF_BYTE
- movzx eax, BYTE [esi+ecx]
-.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub ecx, byte SIZEOF_WORD
- movzx edx, WORD [esi+ecx]
- shl eax, WORD_BIT
- or eax,edx
-.column_ld4:
- movd xmmA,eax
- pop edx
- pop eax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub ecx, byte SIZEOF_DWORD
- movd xmmF, XMM_DWORD [esi+ecx]
- pslldq xmmA, SIZEOF_DWORD
- por xmmA,xmmF
-.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- sub ecx, byte SIZEOF_MMWORD
- movq xmmB, XMM_MMWORD [esi+ecx]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmB
-.column_ld16:
- test cl, SIZEOF_XMMWORD
- jz short .column_ld32
- movdqa xmmF,xmmA
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- mov ecx, SIZEOF_XMMWORD
- jmp short .rgb_ycc_cnv
-.column_ld32:
- test cl, 2*SIZEOF_XMMWORD
- mov ecx, SIZEOF_XMMWORD
- jz short .rgb_ycc_cnv
- movdqa xmmB,xmmA
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16,7
-
-.columnloop:
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
- ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- movdqa xmmG,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
- psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
- pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
- punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
- punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
- movdqa xmmD,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
- psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
- pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
- punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
- punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
- movdqa xmmE,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
- psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
- punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
- pxor xmmH,xmmH
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmB,xmmE
- punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
- movdqa xmmF,xmmD
- punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
- test cl, SIZEOF_XMMWORD/16
- jz short .column_ld2
- sub ecx, byte SIZEOF_XMMWORD/16
- movd xmmA, DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
- test cl, SIZEOF_XMMWORD/8
- jz short .column_ld4
- sub ecx, byte SIZEOF_XMMWORD/8
- movq xmmE, MMWORD [esi+ecx*RGB_PIXELSIZE]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmE
-.column_ld4:
- test cl, SIZEOF_XMMWORD/4
- jz short .column_ld8
- sub ecx, byte SIZEOF_XMMWORD/4
- movdqa xmmE,xmmA
- movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld8:
- test cl, SIZEOF_XMMWORD/2
- mov ecx, SIZEOF_XMMWORD
- jz short .rgb_ycc_cnv
- movdqa xmmF,xmmA
- movdqa xmmH,xmmE
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16,7
-
-.columnloop:
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
- movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
- ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
- punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
- movdqa xmmC,xmmF
- punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
- punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
- movdqa xmmB,xmmA
- punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
- punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
- movdqa xmmG,xmmD
- punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
- punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
- movdqa xmmE,xmmA
- punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
- movdqa xmmH,xmmB
- punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
- pxor xmmF,xmmF
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmD,xmmB
- punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
- movdqa xmmG,xmmE
- punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
- punpcklbw xmmF,xmmH
- punpckhbw xmmH,xmmH
- psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
- psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
- ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
- movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
- movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
-
- movdqa xmm6,xmm1
- punpcklwd xmm1,xmm3
- punpckhwd xmm6,xmm3
- movdqa xmm7,xmm1
- movdqa xmm4,xmm6
- pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
- pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
- pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
- movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
- movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- pxor xmm1,xmm1
- pxor xmm6,xmm6
- punpcklwd xmm1,xmm5 ; xmm1=BOL
- punpckhwd xmm6,xmm5 ; xmm6=BOH
- psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
- psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
-
- movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
-
- paddd xmm7,xmm1
- paddd xmm4,xmm6
- paddd xmm7,xmm5
- paddd xmm4,xmm5
- psrld xmm7,SCALEBITS ; xmm7=CbOL
- psrld xmm4,SCALEBITS ; xmm4=CbOH
- packssdw xmm7,xmm4 ; xmm7=CbO
-
- movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
-
- movdqa xmm6,xmm0
- punpcklwd xmm0,xmm2
- punpckhwd xmm6,xmm2
- movdqa xmm5,xmm0
- movdqa xmm4,xmm6
- pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
- pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
- pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
- movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
- movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- pxor xmm0,xmm0
- pxor xmm6,xmm6
- punpcklwd xmm0,xmm1 ; xmm0=BEL
- punpckhwd xmm6,xmm1 ; xmm6=BEH
- psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
- psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
-
- movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
- paddd xmm5,xmm0
- paddd xmm4,xmm6
- paddd xmm5,xmm1
- paddd xmm4,xmm1
- psrld xmm5,SCALEBITS ; xmm5=CbEL
- psrld xmm4,SCALEBITS ; xmm4=CbEH
- packssdw xmm5,xmm4 ; xmm5=CbE
-
- psllw xmm7,BYTE_BIT
- por xmm5,xmm7 ; xmm5=Cb
- movdqa XMMWORD [ebx], xmm5 ; Save Cb
-
- movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
- movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
-
- movdqa xmm4,xmm0
- punpcklwd xmm0,xmm3
- punpckhwd xmm4,xmm3
- movdqa xmm7,xmm0
- movdqa xmm5,xmm4
- pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
- pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
- pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
- movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
-
- paddd xmm0, XMMWORD [wk(4)]
- paddd xmm4, XMMWORD [wk(5)]
- paddd xmm0,xmm3
- paddd xmm4,xmm3
- psrld xmm0,SCALEBITS ; xmm0=YOL
- psrld xmm4,SCALEBITS ; xmm4=YOH
- packssdw xmm0,xmm4 ; xmm0=YO
-
- pxor xmm3,xmm3
- pxor xmm4,xmm4
- punpcklwd xmm3,xmm1 ; xmm3=ROL
- punpckhwd xmm4,xmm1 ; xmm4=ROH
- psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
- psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
-
- movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
- paddd xmm7,xmm3
- paddd xmm5,xmm4
- paddd xmm7,xmm1
- paddd xmm5,xmm1
- psrld xmm7,SCALEBITS ; xmm7=CrOL
- psrld xmm5,SCALEBITS ; xmm5=CrOH
- packssdw xmm7,xmm5 ; xmm7=CrO
-
- movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
-
- movdqa xmm4,xmm6
- punpcklwd xmm6,xmm2
- punpckhwd xmm4,xmm2
- movdqa xmm1,xmm6
- movdqa xmm5,xmm4
- pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
- pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
- pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
- movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
-
- paddd xmm6, XMMWORD [wk(6)]
- paddd xmm4, XMMWORD [wk(7)]
- paddd xmm6,xmm2
- paddd xmm4,xmm2
- psrld xmm6,SCALEBITS ; xmm6=YEL
- psrld xmm4,SCALEBITS ; xmm4=YEH
- packssdw xmm6,xmm4 ; xmm6=YE
-
- psllw xmm0,BYTE_BIT
- por xmm6,xmm0 ; xmm6=Y
- movdqa XMMWORD [edi], xmm6 ; Save Y
-
- pxor xmm2,xmm2
- pxor xmm4,xmm4
- punpcklwd xmm2,xmm3 ; xmm2=REL
- punpckhwd xmm4,xmm3 ; xmm4=REH
- psrld xmm2,1 ; xmm2=REL*FIX(0.500)
- psrld xmm4,1 ; xmm4=REH*FIX(0.500)
-
- movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
-
- paddd xmm1,xmm2
- paddd xmm5,xmm4
- paddd xmm1,xmm0
- paddd xmm5,xmm0
- psrld xmm1,SCALEBITS ; xmm1=CrEL
- psrld xmm5,SCALEBITS ; xmm5=CrEH
- packssdw xmm1,xmm5 ; xmm1=CrE
-
- psllw xmm7,BYTE_BIT
- por xmm1,xmm7 ; xmm1=Cr
- movdqa XMMWORD [edx], xmm1 ; Save Cr
-
- sub ecx, byte SIZEOF_XMMWORD
- add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
- add edi, byte SIZEOF_XMMWORD ; outptr0
- add ebx, byte SIZEOF_XMMWORD ; outptr1
- add edx, byte SIZEOF_XMMWORD ; outptr2
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .column_ld1
-
- pop ecx ; col
- pop esi
- pop edi
- pop ebx
- pop edx
- poppic eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_buf
- add edi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- dec eax ; num_rows
- jg near .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0	; presumably the byte offset of R within a pixel (R,G,B order) - confirm in jcclrss2.asm
+%define RGB_GREEN 1	; presumably the byte offset of G within a pixel
+%define RGB_BLUE 2	; presumably the byte offset of B within a pixel
+%define RGB_PIXELSIZE 3	; bytes per input pixel; all three bytes are assigned to a component
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2	; presumably renames the entry point emitted by the %include below
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0	; presumably the byte offset of R within a pixel (R,G,B,X order) - confirm in jcclrss2.asm
+%define RGB_GREEN 1	; presumably the byte offset of G within a pixel
+%define RGB_BLUE 2	; presumably the byte offset of B within a pixel
+%define RGB_PIXELSIZE 4	; bytes per input pixel; byte 3 is not assigned to any component
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2	; presumably renames the entry point emitted by the %include below
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2	; presumably the byte offset of R within a pixel (B,G,R order) - confirm in jcclrss2.asm
+%define RGB_GREEN 1	; presumably the byte offset of G within a pixel
+%define RGB_BLUE 0	; presumably the byte offset of B within a pixel
+%define RGB_PIXELSIZE 3	; bytes per input pixel; all three bytes are assigned to a component
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2	; presumably renames the entry point emitted by the %include below
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2	; presumably the byte offset of R within a pixel (B,G,R,X order) - confirm in jcclrss2.asm
+%define RGB_GREEN 1	; presumably the byte offset of G within a pixel
+%define RGB_BLUE 0	; presumably the byte offset of B within a pixel
+%define RGB_PIXELSIZE 4	; bytes per input pixel; byte 3 is not assigned to any component
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2	; presumably renames the entry point emitted by the %include below
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3	; presumably the byte offset of R within a pixel (X,B,G,R order) - confirm in jcclrss2.asm
+%define RGB_GREEN 2	; presumably the byte offset of G within a pixel
+%define RGB_BLUE 1	; presumably the byte offset of B within a pixel
+%define RGB_PIXELSIZE 4	; bytes per input pixel; byte 0 is not assigned to any component
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2	; presumably renames the entry point emitted by the %include below
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1	; presumably the byte offset of R within a pixel (X,R,G,B order) - confirm in jcclrss2.asm
+%define RGB_GREEN 2	; presumably the byte offset of G within a pixel
+%define RGB_BLUE 3	; presumably the byte offset of B within a pixel
+%define RGB_PIXELSIZE 4	; bytes per input pixel; byte 0 is not assigned to any component
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2	; presumably renames the entry point emitted by the %include below
+%include "jcclrss2.asm"
diff --git a/simd/jdclrmmx.asm b/simd/jdclrmmx.asm
new file mode 100644
index 0000000..506500e
--- /dev/null
+++ b/simd/jdclrmmx.asm
@@ -0,0 +1,402 @@
+;
+; jdclrmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+; jsimd_ycc_rgb_convert_mmx: YCbCr to RGB conversion, one mmword (8 pixels) of Y/Cb/Cr per column-loop pass.
+; Convert some rows of samples to the output colorspace.
+; The output layout (3 or 4 bytes per pixel) is fixed at assembly time by RGB_PIXELSIZE.
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
+; JSAMPIMAGE input_buf, JDIMENSION input_row,
+; JSAMPARRAY output_buf, int num_rows)
+; Arguments are read from the stack through eax (esp right after push ebp); offsets are defined below.
+
+%define out_width(b) (b)+8 ; JDIMENSION out_width
+%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
+%define input_row(b) (b)+16 ; JDIMENSION input_row
+%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
+%define num_rows(b) (b)+24 ; int num_rows
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+
+ align 16
+ global EXTN(jsimd_ycc_rgb_convert_mmx)
+
+EXTN(jsimd_ycc_rgb_convert_mmx):
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4 ; leave room for the saved pointer stored two lines below
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp],eax ; remember the unaligned frame so the epilogue can restore it
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve WK_NUM mmwords of scratch space below ebp
+ pushpic eax ; make room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [out_width(eax)] ; num_cols
+ test ecx,ecx
+ jz near .return ; zero-width output: nothing to convert
+
+ push ecx ; keep num_cols while ecx is borrowed for input_row
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [input_row(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0] (rows later loaded as Y)
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1] (rows later loaded as Cb)
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2] (rows later loaded as Cr)
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; step each row-pointer array to entry input_row
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx ; ecx = num_cols again
+
+ mov edi, JSAMPARRAY [output_buf(eax)] ; edi = array of output row pointers
+ mov eax, INT [num_rows(eax)]
+ test eax,eax
+ jle near .return ; nothing to do unless num_rows is positive
+ alignx 16,7
+.rowloop:
+ push eax ; save the per-row state; restored in reverse order at .nextrow
+ push edi
+ push edx
+ push ebx
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr0
+ mov ebx, JSAMPROW [ebx] ; inptr1
+ mov edx, JSAMPROW [edx] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ alignx 16,7
+.columnloop:
+
+ movq mm5, MMWORD [ebx] ; mm5=Cb(01234567)
+ movq mm1, MMWORD [edx] ; mm1=Cr(01234567)
+
+ pcmpeqw mm4,mm4 ; mm4 = all ones
+ pcmpeqw mm7,mm7 ; mm7 = all ones
+ psrlw mm4,BYTE_BIT ; keep only the low byte of every word
+ psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+ movq mm0,mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand mm4,mm5 ; mm4=Cb(0246)=CbE
+ psrlw mm5,BYTE_BIT ; mm5=Cb(1357)=CbO
+ pand mm0,mm1 ; mm0=Cr(0246)=CrE
+ psrlw mm1,BYTE_BIT ; mm1=Cr(1357)=CrO
+
+ paddw mm4,mm7 ; CbE -= 128 (0xFF80 is -128 as a signed word)
+ paddw mm5,mm7 ; CbO -= 128
+ paddw mm0,mm7 ; CrE -= 128
+ paddw mm1,mm7 ; CrO -= 128
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movq mm2,mm4 ; mm2=CbE
+ movq mm3,mm5 ; mm3=CbO
+ paddw mm4,mm4 ; mm4=2*CbE
+ paddw mm5,mm5 ; mm5=2*CbO
+ movq mm6,mm0 ; mm6=CrE
+ movq mm7,mm1 ; mm7=CrO
+ paddw mm0,mm0 ; mm0=2*CrE
+ paddw mm1,mm1 ; mm1=2*CrO
+
+ pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800))
+ pmulhw mm5,[GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800))
+ pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200))
+ pmulhw mm1,[GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200))
+
+ paddw mm4,[GOTOFF(eax,PW_ONE)] ; presumably +1 per word so the shift below rounds - constant is defined elsewhere
+ paddw mm5,[GOTOFF(eax,PW_ONE)]
+ psraw mm4,1 ; mm4=(CbE * -FIX(0.22800))
+ psraw mm5,1 ; mm5=(CbO * -FIX(0.22800))
+ paddw mm0,[GOTOFF(eax,PW_ONE)]
+ paddw mm1,[GOTOFF(eax,PW_ONE)]
+ psraw mm0,1 ; mm0=(CrE * FIX(0.40200))
+ psraw mm1,1 ; mm1=(CrO * FIX(0.40200))
+
+ paddw mm4,mm2
+ paddw mm5,mm3
+ paddw mm4,mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw mm5,mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw mm0,mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw mm1,mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E
+ movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O
+
+ movq mm4,mm2 ; mm4=CbE (copy used for the high half)
+ movq mm5,mm3 ; mm5=CbO (copy used for the high half)
+ punpcklwd mm2,mm6 ; interleave low CbE/CrE words into (Cb,Cr) pairs
+ punpckhwd mm4,mm6 ; interleave high CbE/CrE words into (Cb,Cr) pairs
+ pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)] ; one dword per pixel: weighted Cb + weighted Cr
+ pmaddwd mm4,[GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd mm3,mm7 ; same pairing for the odd pixels
+ punpckhwd mm5,mm7
+ pmaddwd mm3,[GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd mm2,[GOTOFF(eax,PD_ONEHALF)] ; add half an output unit so the shift rounds to nearest
+ paddd mm4,[GOTOFF(eax,PD_ONEHALF)]
+ psrad mm2,SCALEBITS ; drop the fixed-point fraction bits
+ psrad mm4,SCALEBITS
+ paddd mm3,[GOTOFF(eax,PD_ONEHALF)]
+ paddd mm5,[GOTOFF(eax,PD_ONEHALF)]
+ psrad mm3,SCALEBITS
+ psrad mm5,SCALEBITS
+
+ packssdw mm2,mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw mm3,mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw mm2,mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw mm3,mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movq mm5, MMWORD [esi] ; mm5=Y(01234567)
+
+ pcmpeqw mm4,mm4
+ psrlw mm4,BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..}
+ pand mm4,mm5 ; mm4=Y(0246)=YE
+ psrlw mm5,BYTE_BIT ; mm5=Y(1357)=YO
+
+ paddw mm0,mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+ paddw mm1,mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+ packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
+ packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+ paddw mm2,mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+ paddw mm3,mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+ packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
+ packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+ paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+ paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+ packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
+ packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+ punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07)
+ punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27)
+
+ movq mmG,mmA
+ movq mmH,mmA
+ punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03)
+ punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07)
+
+ psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
+ psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
+
+ movq mmC,mmD
+ movq mmB,mmD
+ punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14)
+ punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --)
+
+ psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
+
+ movq mmF,mmE
+ punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25)
+ punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --)
+
+ punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12)
+ punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05)
+ punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27)
+
+ cmp ecx, byte SIZEOF_MMWORD ; is a full group of pixels still wanted?
+ jb short .column_st16 ; no: store only the remaining pixels
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+ sub ecx, byte SIZEOF_MMWORD ; one full group of columns consumed
+ jz short .nextrow ; the row is finished
+
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ jmp near .columnloop
+ alignx 16,7
+
+.column_st16: ; partial tail: ecx = pixels left, fewer than one full group
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_MMWORD ; from here on ecx is a byte count
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq mmA,mmC ; the remaining bytes come from the third mmword
+ sub ecx, byte 2*SIZEOF_MMWORD
+ add edi, byte 2*SIZEOF_MMWORD
+ jmp short .column_st4 ; less than one mmword can remain after two were stored
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA,mmE ; continue with the second mmword
+ sub ecx, byte SIZEOF_MMWORD
+ add edi, byte SIZEOF_MMWORD
+.column_st4:
+ movd eax,mmA ; eax = low dword of the data still to be stored
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st2
+ mov DWORD [edi+0*SIZEOF_DWORD], eax
+ psrlq mmA,DWORD_BIT ; bring the upper dword down
+ movd eax,mmA
+ sub ecx, byte SIZEOF_DWORD
+ add edi, byte SIZEOF_DWORD
+.column_st2:
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov WORD [edi+0*SIZEOF_WORD], ax
+ shr eax,WORD_BIT ; expose the next byte in al
+ sub ecx, byte SIZEOF_WORD
+ add edi, byte SIZEOF_WORD
+.column_st1:
+ cmp ecx, byte SIZEOF_BYTE
+ jb short .nextrow
+ mov BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+ pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+ punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36)
+ punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17)
+ punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37)
+
+ movq mmC,mmA
+ punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32)
+ punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36)
+ movq mmG,mmB
+ punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33)
+ punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37)
+
+ movq mmD,mmA
+ punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31)
+ punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33)
+ movq mmH,mmC
+ punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35)
+ punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37)
+
+ cmp ecx, byte SIZEOF_MMWORD ; is a full group of pixels still wanted?
+ jb short .column_st16 ; no: store only the remaining pixels
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+ sub ecx, byte SIZEOF_MMWORD ; one full group of columns consumed
+ jz short .nextrow ; the row is finished
+
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ jmp near .columnloop
+ alignx 16,7
+
+.column_st16: ; partial tail: ecx = pixels left, fewer than one full group
+ cmp ecx, byte SIZEOF_MMWORD/2 ; at least half a group left (two mmwords of output)?
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq mmA,mmC ; shift the second half of the group into place
+ movq mmD,mmH
+ sub ecx, byte SIZEOF_MMWORD/2
+ add edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD/4 ; at least a quarter group left (one mmword of output)?
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA,mmD
+ sub ecx, byte SIZEOF_MMWORD/4
+ add edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+ cmp ecx, byte SIZEOF_MMWORD/8 ; one last pixel (one dword of output)?
+ jb short .nextrow
+ movd DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ alignx 16,7
+
+.nextrow:
+ pop ecx ; ecx = num_cols
+ pop esi ; esi, ebx, edx = positions in the three input row-pointer arrays
+ pop ebx
+ pop edx
+ pop edi ; edi = position in the output row-pointer array
+ pop eax ; eax = rows still to do (including this one)
+
+ add esi, byte SIZEOF_JSAMPROW ; advance every row-pointer array by one entry
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ add edi, byte SIZEOF_JSAMPROW ; output_buf
+ dec eax ; num_rows
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
diff --git a/simd/jdclrss2.asm b/simd/jdclrss2.asm
new file mode 100644
index 0000000..b3ef195
--- /dev/null
+++ b/simd/jdclrss2.asm
@@ -0,0 +1,500 @@
+;
+; jdclrss2.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+; jsimd_ycc_rgb_convert_sse2: YCbCr to RGB conversion, one xmmword (16 pixels) of Y/Cb/Cr per column-loop pass.
+; Convert some rows of samples to the output colorspace.
+; The output layout (3 or 4 bytes per pixel) is fixed at assembly time by RGB_PIXELSIZE.
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
+; JSAMPIMAGE input_buf, JDIMENSION input_row,
+; JSAMPARRAY output_buf, int num_rows)
+; Arguments are read from the stack through eax (esp right after push ebp); offsets are defined below.
+
+%define out_width(b) (b)+8 ; JDIMENSION out_width
+%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
+%define input_row(b) (b)+16 ; JDIMENSION input_row
+%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
+%define num_rows(b) (b)+24 ; int num_rows
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+
+ align 16
+ global EXTN(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4 ; leave room for the saved pointer stored two lines below
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax ; remember the unaligned frame so the epilogue can restore it
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve WK_NUM xmmwords of scratch space below ebp
+ pushpic eax ; make room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [out_width(eax)] ; num_cols
+ test ecx,ecx
+ jz near .return ; zero-width output: nothing to convert
+
+ push ecx ; keep num_cols while ecx is borrowed for input_row
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [input_row(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0] (rows later loaded as Y)
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1] (rows later loaded as Cb)
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2] (rows later loaded as Cr)
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; step each row-pointer array to entry input_row
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx ; ecx = num_cols again
+
+ mov edi, JSAMPARRAY [output_buf(eax)] ; edi = array of output row pointers
+ mov eax, INT [num_rows(eax)]
+ test eax,eax
+ jle near .return ; nothing to do unless num_rows is positive
+ alignx 16,7
+.rowloop:
+ push eax ; save the per-row state; restored in reverse order at .nextrow
+ push edi
+ push edx
+ push ebx
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr0
+ mov ebx, JSAMPROW [ebx] ; inptr1
+ mov edx, JSAMPROW [edx] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ alignx 16,7
+.columnloop:
+
+ movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
+ movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
+
+ pcmpeqw xmm4,xmm4 ; xmm4 = all ones
+ pcmpeqw xmm7,xmm7 ; xmm7 = all ones
+ psrlw xmm4,BYTE_BIT ; keep only the low byte of every word
+ psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+ movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
+ psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
+ pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
+ psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
+
+ paddw xmm4,xmm7 ; CbE -= 128 (0xFF80 is -128 as a signed word)
+ paddw xmm5,xmm7 ; CbO -= 128
+ paddw xmm0,xmm7 ; CrE -= 128
+ paddw xmm1,xmm7 ; CrO -= 128
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movdqa xmm2,xmm4 ; xmm2=CbE
+ movdqa xmm3,xmm5 ; xmm3=CbO
+ paddw xmm4,xmm4 ; xmm4=2*CbE
+ paddw xmm5,xmm5 ; xmm5=2*CbO
+ movdqa xmm6,xmm0 ; xmm6=CrE
+ movdqa xmm7,xmm1 ; xmm7=CrO
+ paddw xmm0,xmm0 ; xmm0=2*CrE
+ paddw xmm1,xmm1 ; xmm1=2*CrO
+
+ pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
+ pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
+ pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
+ pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
+
+ paddw xmm4,[GOTOFF(eax,PW_ONE)] ; presumably +1 per word so the shift below rounds - constant is defined elsewhere
+ paddw xmm5,[GOTOFF(eax,PW_ONE)]
+ psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
+ psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
+ paddw xmm0,[GOTOFF(eax,PW_ONE)]
+ paddw xmm1,[GOTOFF(eax,PW_ONE)]
+ psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
+ psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
+
+ paddw xmm4,xmm2
+ paddw xmm5,xmm3
+ paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
+
+ movdqa xmm4,xmm2 ; xmm4=CbE (copy used for the high half)
+ movdqa xmm5,xmm3 ; xmm5=CbO (copy used for the high half)
+ punpcklwd xmm2,xmm6 ; interleave low CbE/CrE words into (Cb,Cr) pairs
+ punpckhwd xmm4,xmm6 ; interleave high CbE/CrE words into (Cb,Cr) pairs
+ pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] ; one dword per pixel: weighted Cb + weighted Cr
+ pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd xmm3,xmm7 ; same pairing for the odd pixels
+ punpckhwd xmm5,xmm7
+ pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] ; rounding term added before the descaling shift
+ paddd xmm4,[GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm2,SCALEBITS ; drop the fixed-point fraction bits
+ psrad xmm4,SCALEBITS
+ paddd xmm3,[GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm3,SCALEBITS
+ psrad xmm5,SCALEBITS
+
+ packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm4,xmm4
+ psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
+ psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
+
+ paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+ paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+ packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+ paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+ packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+ paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+ packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG,xmmA
+ movdqa xmmH,xmmA
+ punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC,xmmD
+ movdqa xmmB,xmmD
+ punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF,xmmE
+ punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB,xmmE
+ punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB,xmmF
+ punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp ecx, byte SIZEOF_XMMWORD ; is a full group of pixels still wanted?
+ jb short .column_st32 ; no: store only the remaining pixels
+
+ test edi, SIZEOF_XMMWORD-1 ; is outptr aligned to an xmmword?
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
+ add edi, byte SIZEOF_XMMWORD ; outptr
+.out0:
+ sub ecx, byte SIZEOF_XMMWORD ; one full group of columns consumed
+ jz near .nextrow ; the row is finished
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16,7
+
+.column_st32: ; partial tail: ecx = pixels left, fewer than one full group
+ pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_XMMWORD ; from here on ecx is a byte count
+ jb short .column_st16
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmF ; the remaining bytes come from the third xmmword
+ sub ecx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15 ; less than one xmmword can remain after two were stored
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmD ; continue with the second xmmword
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15: ; store the last ecx bytes (fewer than one xmmword) of xmmA through a byte mask
+ mov eax,ecx ; eax = byte count, needed again for the alignment test below
+ xor ecx, byte 0x0F ; ecx = 15 - count
+ shl ecx, 2 ; ... times 4: shift amount in bits, one nibble per missing byte
+ movd xmmB,ecx
+ psrlq xmmH,4 ; xmmH starts one nibble shorter than xmmE
+ pcmpeqb xmmE,xmmE
+ psrlq xmmH,xmmB ; low qword of xmmH keeps (count) nibbles of ones
+ psrlq xmmE,xmmB ; low qword of xmmE keeps (count+1) nibbles of ones
+ punpcklbw xmmE,xmmH ; xmmE = mask: top bit set in each of the first (count) bytes
+ ; ---------------- if outptr is unaligned but the bytes fit in one aligned xmmword, align outptr down and shift data and mask up
+ mov ecx,edi
+ and ecx, byte SIZEOF_XMMWORD-1 ; ecx = misalignment of outptr in bytes
+ jz short .adj0 ; already aligned: store as is
+ add eax,ecx ; eax = misalignment + byte count
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .adj0 ; the bytes would not fit in one aligned xmmword: store as is
+ and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
+ shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
+ movdqa xmmG,xmmA ; keep unshifted copies for the short-shift case at .adj1
+ movdqa xmmC,xmmE
+ pslldq xmmA, SIZEOF_XMMWORD/2 ; begin with a shift of half an xmmword
+ pslldq xmmE, SIZEOF_XMMWORD/2
+ movd xmmD,ecx ; xmmD = full shift amount in bits
+ sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+ jb short .adj1 ; the wanted shift is shorter than half an xmmword
+ movd xmmF,ecx ; xmmF = shift still owed beyond the half xmmword
+ psllq xmmA,xmmF
+ psllq xmmE,xmmF
+ jmp short .adj0
+.adj1: neg ecx ; ecx = bits by which the half-xmmword shift overshot
+ movd xmmF,ecx
+ psrlq xmmA,xmmF ; upper qword = bits carried out of the lower qword
+ psrlq xmmE,xmmF
+ psllq xmmG,xmmD ; each qword shifted by the full amount
+ psllq xmmC,xmmD
+ por xmmA,xmmG ; merge the carried bits with the shifted qwords
+ por xmmE,xmmC
+.adj0: ; ----------------
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC,xmmA
+ punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG,xmmB
+ punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD,xmmA
+ punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH,xmmC
+ punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp ecx, byte SIZEOF_XMMWORD ; is a full group of pixels still wanted?
+ jb short .column_st32 ; no: store only the remaining pixels
+
+ test edi, SIZEOF_XMMWORD-1 ; is outptr aligned to an xmmword?
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
+ add edi, byte SIZEOF_XMMWORD ; outptr
+.out0:
+ sub ecx, byte SIZEOF_XMMWORD ; one full group of columns consumed
+ jz near .nextrow ; the row is finished
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16,7
+
+.column_st32: ; partial tail: ecx = pixels left, fewer than one full group
+ pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
+ cmp ecx, byte SIZEOF_XMMWORD/2 ; at least half a group left (two xmmwords of output)?
+ jb short .column_st16
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmC ; shift the second half of the group into place
+ movdqa xmmD,xmmH
+ sub ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD/4 ; at least a quarter group left (one xmmword of output)?
+ jb short .column_st15
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmD
+ sub ecx, byte SIZEOF_XMMWORD/4
+.column_st15: ; store the last ecx pixels (fewer than a quarter group) of xmmA through a byte mask
+ cmp ecx, byte SIZEOF_XMMWORD/16
+ jb short .nextrow ; no pixel left
+ mov eax,ecx ; eax = pixel count, needed again for the alignment test below
+ xor ecx, byte 0x03 ; ecx = 3 - count
+ inc ecx ; ecx = 4 - count
+ shl ecx, 4 ; ... times 16: shift amount in bits, two mask bytes per missing pixel
+ movd xmmF,ecx
+ psrlq xmmE,xmmF ; low qword of xmmE keeps 2*count bytes of ones
+ punpcklbw xmmE,xmmE ; doubling each byte gives a mask over the first 4*count bytes
+ ; ---------------- if outptr is unaligned but the bytes fit in one aligned xmmword, align outptr down and shift data and mask up
+ mov ecx,edi
+ and ecx, byte SIZEOF_XMMWORD-1 ; ecx = misalignment of outptr in bytes
+ jz short .adj0 ; already aligned: store as is
+ lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .adj0 ; the bytes would not fit in one aligned xmmword: store as is
+ and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
+ shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
+ movdqa xmmB,xmmA ; keep unshifted copies for the short-shift case at .adj1
+ movdqa xmmG,xmmE
+ pslldq xmmA, SIZEOF_XMMWORD/2 ; begin with a shift of half an xmmword
+ pslldq xmmE, SIZEOF_XMMWORD/2
+ movd xmmC,ecx ; xmmC = full shift amount in bits
+ sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+ jb short .adj1 ; the wanted shift is shorter than half an xmmword
+ movd xmmH,ecx ; xmmH = shift still owed beyond the half xmmword
+ psllq xmmA,xmmH
+ psllq xmmE,xmmH
+ jmp short .adj0
+.adj1: neg ecx ; ecx = bits by which the half-xmmword shift overshot
+ movd xmmH,ecx
+ psrlq xmmA,xmmH ; upper qword = bits carried out of the lower qword
+ psrlq xmmE,xmmH
+ psllq xmmB,xmmC ; each qword shifted by the full amount
+ psllq xmmG,xmmC
+ por xmmA,xmmB ; merge the carried bits with the shifted qwords
+ por xmmE,xmmG
+.adj0: ; ----------------
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ alignx 16,7
+
+.nextrow:
+ pop ecx ; ecx = num_cols
+ pop esi ; esi, ebx, edx = positions in the three input row-pointer arrays
+ pop ebx
+ pop edx
+ pop edi ; edi = position in the output row-pointer array
+ pop eax ; eax = rows still to do (including this one)
+
+ add esi, byte SIZEOF_JSAMPROW ; advance every row-pointer array by one entry
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ add edi, byte SIZEOF_JSAMPROW ; output_buf
+ dec eax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
diff --git a/simd/jdcolmmx.asm b/simd/jdcolmmx.asm
index 742c9ce..f34387b 100644
--- a/simd/jdcolmmx.asm
+++ b/simd/jdcolmmx.asm
@@ -2,6 +2,7 @@
; jdcolmmx.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
@@ -48,386 +49,70 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1)
alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
-; JSAMPIMAGE input_buf, JDIMENSION input_row,
-; JSAMPARRAY output_buf, int num_rows)
-;
-
-%define out_width(b) (b)+8 ; JDIMENSION out_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define input_row(b) (b)+16 ; JDIMENSION input_row
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_ycc_rgb_convert_mmx)
-
-EXTN(jsimd_ycc_rgb_convert_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make a room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [out_width(eax)] ; num_cols
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov edi, JSAMPIMAGE [input_buf(eax)]
- mov ecx, JDIMENSION [input_row(eax)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- lea esi, [esi+ecx*SIZEOF_JSAMPROW]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov edi, JSAMPARRAY [output_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- push eax
- push edi
- push edx
- push ebx
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr0
- mov ebx, JSAMPROW [ebx] ; inptr1
- mov edx, JSAMPROW [edx] ; inptr2
- mov edi, JSAMPROW [edi] ; outptr
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
- alignx 16,7
-.columnloop:
-
- movq mm5, MMWORD [ebx] ; mm5=Cb(01234567)
- movq mm1, MMWORD [edx] ; mm1=Cr(01234567)
-
- pcmpeqw mm4,mm4
- pcmpeqw mm7,mm7
- psrlw mm4,BYTE_BIT
- psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
- movq mm0,mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
-
- pand mm4,mm5 ; mm4=Cb(0246)=CbE
- psrlw mm5,BYTE_BIT ; mm5=Cb(1357)=CbO
- pand mm0,mm1 ; mm0=Cr(0246)=CrE
- psrlw mm1,BYTE_BIT ; mm1=Cr(1357)=CrO
-
- paddw mm4,mm7
- paddw mm5,mm7
- paddw mm0,mm7
- paddw mm1,mm7
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movq mm2,mm4 ; mm2=CbE
- movq mm3,mm5 ; mm3=CbO
- paddw mm4,mm4 ; mm4=2*CbE
- paddw mm5,mm5 ; mm5=2*CbO
- movq mm6,mm0 ; mm6=CrE
- movq mm7,mm1 ; mm7=CrO
- paddw mm0,mm0 ; mm0=2*CrE
- paddw mm1,mm1 ; mm1=2*CrO
-
- pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800))
- pmulhw mm5,[GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800))
- pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200))
- pmulhw mm1,[GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200))
-
- paddw mm4,[GOTOFF(eax,PW_ONE)]
- paddw mm5,[GOTOFF(eax,PW_ONE)]
- psraw mm4,1 ; mm4=(CbE * -FIX(0.22800))
- psraw mm5,1 ; mm5=(CbO * -FIX(0.22800))
- paddw mm0,[GOTOFF(eax,PW_ONE)]
- paddw mm1,[GOTOFF(eax,PW_ONE)]
- psraw mm0,1 ; mm0=(CrE * FIX(0.40200))
- psraw mm1,1 ; mm1=(CrO * FIX(0.40200))
-
- paddw mm4,mm2
- paddw mm5,mm3
- paddw mm4,mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
- paddw mm5,mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
- paddw mm0,mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
- paddw mm1,mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E
- movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O
-
- movq mm4,mm2
- movq mm5,mm3
- punpcklwd mm2,mm6
- punpckhwd mm4,mm6
- pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd mm4,[GOTOFF(eax,PW_MF0344_F0285)]
- punpcklwd mm3,mm7
- punpckhwd mm5,mm7
- pmaddwd mm3,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
- paddd mm2,[GOTOFF(eax,PD_ONEHALF)]
- paddd mm4,[GOTOFF(eax,PD_ONEHALF)]
- psrad mm2,SCALEBITS
- psrad mm4,SCALEBITS
- paddd mm3,[GOTOFF(eax,PD_ONEHALF)]
- paddd mm5,[GOTOFF(eax,PD_ONEHALF)]
- psrad mm3,SCALEBITS
- psrad mm5,SCALEBITS
-
- packssdw mm2,mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
- packssdw mm3,mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
- psubw mm2,mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
- psubw mm3,mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
- movq mm5, MMWORD [esi] ; mm5=Y(01234567)
-
- pcmpeqw mm4,mm4
- psrlw mm4,BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..}
- pand mm4,mm5 ; mm4=Y(0246)=YE
- psrlw mm5,BYTE_BIT ; mm5=Y(1357)=YO
-
- paddw mm0,mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
- paddw mm1,mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
- packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
- packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
-
- paddw mm2,mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
- paddw mm3,mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
- packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
- packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
-
- paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
- paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
- packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
- packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
- ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
- ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
- ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
-
- punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
- punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07)
- punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27)
-
- movq mmG,mmA
- movq mmH,mmA
- punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03)
- punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07)
-
- psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
- psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
-
- movq mmC,mmD
- movq mmB,mmD
- punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14)
- punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --)
-
- psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
-
- movq mmF,mmE
- punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25)
- punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --)
-
- punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12)
- punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05)
- punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27)
-
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st16
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
- movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
-
- sub ecx, byte SIZEOF_MMWORD
- jz short .nextrow
-
- add esi, byte SIZEOF_MMWORD ; inptr0
- add ebx, byte SIZEOF_MMWORD ; inptr1
- add edx, byte SIZEOF_MMWORD ; inptr2
- add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
- jmp near .columnloop
- alignx 16,7
-
-.column_st16:
- lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
- cmp ecx, byte 2*SIZEOF_MMWORD
- jb short .column_st8
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
- movq mmA,mmC
- sub ecx, byte 2*SIZEOF_MMWORD
- add edi, byte 2*SIZEOF_MMWORD
- jmp short .column_st4
-.column_st8:
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st4
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq mmA,mmE
- sub ecx, byte SIZEOF_MMWORD
- add edi, byte SIZEOF_MMWORD
-.column_st4:
- movd eax,mmA
- cmp ecx, byte SIZEOF_DWORD
- jb short .column_st2
- mov DWORD [edi+0*SIZEOF_DWORD], eax
- psrlq mmA,DWORD_BIT
- movd eax,mmA
- sub ecx, byte SIZEOF_DWORD
- add edi, byte SIZEOF_DWORD
-.column_st2:
- cmp ecx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [edi+0*SIZEOF_WORD], ax
- shr eax,WORD_BIT
- sub ecx, byte SIZEOF_WORD
- add edi, byte SIZEOF_WORD
-.column_st1:
- cmp ecx, byte SIZEOF_BYTE
- jb short .nextrow
- mov BYTE [edi+0*SIZEOF_BYTE], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
- pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
- pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%else
- pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
- pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%endif
- ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
- ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
- ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
- ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
-
- punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
- punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36)
- punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17)
- punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37)
-
- movq mmC,mmA
- punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32)
- punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36)
- movq mmG,mmB
- punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33)
- punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37)
-
- movq mmD,mmA
- punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31)
- punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33)
- movq mmH,mmC
- punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35)
- punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37)
-
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st16
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
- movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
- movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
-
- sub ecx, byte SIZEOF_MMWORD
- jz short .nextrow
-
- add esi, byte SIZEOF_MMWORD ; inptr0
- add ebx, byte SIZEOF_MMWORD ; inptr1
- add edx, byte SIZEOF_MMWORD ; inptr2
- add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
- jmp near .columnloop
- alignx 16,7
-
-.column_st16:
- cmp ecx, byte SIZEOF_MMWORD/2
- jb short .column_st8
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
- movq mmA,mmC
- movq mmD,mmH
- sub ecx, byte SIZEOF_MMWORD/2
- add edi, byte 2*SIZEOF_MMWORD
-.column_st8:
- cmp ecx, byte SIZEOF_MMWORD/4
- jb short .column_st4
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq mmA,mmD
- sub ecx, byte SIZEOF_MMWORD/4
- add edi, byte 1*SIZEOF_MMWORD
-.column_st4:
- cmp ecx, byte SIZEOF_MMWORD/8
- jb short .nextrow
- movd DWORD [edi+0*SIZEOF_DWORD], mmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- alignx 16,7
-
-.nextrow:
- pop ecx
- pop esi
- pop ebx
- pop edx
- pop edi
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- add edi, byte SIZEOF_JSAMPROW ; output_buf
- dec eax ; num_rows
- jg near .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
+%include "jdclrmmx.asm" ; expansion 1: generic jsimd_ycc_rgb_convert_mmx, assembled with whatever RGB_* values are in effect on entry (presumably set by an earlier include -- not visible in this hunk)
+
+%undef RGB_RED ; expansion 2 (extrgb): drop the previous layout so it can be redefined below
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0 ; NOTE(review): RGB_RED/GREEN/BLUE look like per-component positions inside one pixel -- confirm against jdclrmmx.asm
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3 ; the included code branches on RGB_PIXELSIZE == 3 versus 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx ; from here on the generic name expands to the extrgb name
+%include "jdclrmmx.asm" ; re-assemble the same template under the new name and layout
+
+%undef RGB_RED ; expansion 3 (extrgbx): same component order as extrgb, RGB_PIXELSIZE 4
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx ; plain %define replaces the previous mapping; no %undef of the name is needed
+%include "jdclrmmx.asm"
+
+%undef RGB_RED ; expansion 4 (extbgr): red and blue values swapped relative to extrgb, RGB_PIXELSIZE 3
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED ; expansion 5 (extbgrx): same component order as extbgr, RGB_PIXELSIZE 4
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED ; expansion 6 (extxbgr): colors take values 3,2,1; value 0 is not assigned to any color
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED ; expansion 7 (extxrgb): colors take values 1,2,3; value 0 is not assigned to any color
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
+%include "jdclrmmx.asm"
diff --git a/simd/jdcolss2.asm b/simd/jdcolss2.asm
index 6f74cc6..b13c944 100644
--- a/simd/jdcolss2.asm
+++ b/simd/jdcolss2.asm
@@ -2,6 +2,7 @@
; jdcolss2.asm - colorspace conversion (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
@@ -48,484 +49,70 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
-; JSAMPIMAGE input_buf, JDIMENSION input_row,
-; JSAMPARRAY output_buf, int num_rows)
-;
-
-%define out_width(b) (b)+8 ; JDIMENSION out_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define input_row(b) (b)+16 ; JDIMENSION input_row
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_ycc_rgb_convert_sse2)
-
-EXTN(jsimd_ycc_rgb_convert_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make a room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [out_width(eax)] ; num_cols
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov edi, JSAMPIMAGE [input_buf(eax)]
- mov ecx, JDIMENSION [input_row(eax)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- lea esi, [esi+ecx*SIZEOF_JSAMPROW]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov edi, JSAMPARRAY [output_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- push eax
- push edi
- push edx
- push ebx
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr0
- mov ebx, JSAMPROW [ebx] ; inptr1
- mov edx, JSAMPROW [edx] ; inptr2
- mov edi, JSAMPROW [edi] ; outptr
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
- alignx 16,7
-.columnloop:
-
- movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
- movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
-
- pcmpeqw xmm4,xmm4
- pcmpeqw xmm7,xmm7
- psrlw xmm4,BYTE_BIT
- psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
- movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
- pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
- psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
- pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
- psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
-
- paddw xmm4,xmm7
- paddw xmm5,xmm7
- paddw xmm0,xmm7
- paddw xmm1,xmm7
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movdqa xmm2,xmm4 ; xmm2=CbE
- movdqa xmm3,xmm5 ; xmm3=CbO
- paddw xmm4,xmm4 ; xmm4=2*CbE
- paddw xmm5,xmm5 ; xmm5=2*CbO
- movdqa xmm6,xmm0 ; xmm6=CrE
- movdqa xmm7,xmm1 ; xmm7=CrO
- paddw xmm0,xmm0 ; xmm0=2*CrE
- paddw xmm1,xmm1 ; xmm1=2*CrO
-
- pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
- pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
- pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
- pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
-
- paddw xmm4,[GOTOFF(eax,PW_ONE)]
- paddw xmm5,[GOTOFF(eax,PW_ONE)]
- psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
- psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
- paddw xmm0,[GOTOFF(eax,PW_ONE)]
- paddw xmm1,[GOTOFF(eax,PW_ONE)]
- psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
- psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
-
- paddw xmm4,xmm2
- paddw xmm5,xmm3
- paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
- paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
- paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
- paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
-
- movdqa xmm4,xmm2
- movdqa xmm5,xmm3
- punpcklwd xmm2,xmm6
- punpckhwd xmm4,xmm6
- pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
- punpcklwd xmm3,xmm7
- punpckhwd xmm5,xmm7
- pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
- paddd xmm2,[GOTOFF(eax,PD_ONEHALF)]
- paddd xmm4,[GOTOFF(eax,PD_ONEHALF)]
- psrad xmm2,SCALEBITS
- psrad xmm4,SCALEBITS
- paddd xmm3,[GOTOFF(eax,PD_ONEHALF)]
- paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
- psrad xmm3,SCALEBITS
- psrad xmm5,SCALEBITS
-
- packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
- packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
- psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
- psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
- movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
-
- pcmpeqw xmm4,xmm4
- psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
- pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
- psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
-
- paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
- paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
- packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
- packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
-
- paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
- paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
- packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
- packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
-
- paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
- paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
- packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
- packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
- punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
- movdqa xmmG,xmmA
- movdqa xmmH,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
- punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
- psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
- psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
- movdqa xmmC,xmmD
- movdqa xmmB,xmmD
- punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
- punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
- psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
- movdqa xmmF,xmmE
- punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
- punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
- pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
- movdqa xmmB,xmmE
- punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
- punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
- punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
- pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
- movdqa xmmB,xmmF
- punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
- punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
- punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
- punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test edi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
- add edi, byte SIZEOF_XMMWORD ; outptr
-.out0:
- sub ecx, byte SIZEOF_XMMWORD
- jz near .nextrow
-
- add esi, byte SIZEOF_XMMWORD ; inptr0
- add ebx, byte SIZEOF_XMMWORD ; inptr1
- add edx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
-
-.column_st32:
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
- lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
- cmp ecx, byte 2*SIZEOF_XMMWORD
- jb short .column_st16
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmF
- sub ecx, byte 2*SIZEOF_XMMWORD
- jmp short .column_st15
-.column_st16:
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st15
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub ecx, byte SIZEOF_XMMWORD
-.column_st15:
- mov eax,ecx
- xor ecx, byte 0x0F
- shl ecx, 2
- movd xmmB,ecx
- psrlq xmmH,4
- pcmpeqb xmmE,xmmE
- psrlq xmmH,xmmB
- psrlq xmmE,xmmB
- punpcklbw xmmE,xmmH
- ; ----------------
- mov ecx,edi
- and ecx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- add eax,ecx
- cmp eax, byte SIZEOF_XMMWORD
- ja short .adj0
- and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmG,xmmA
- movdqa xmmC,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmD,ecx
- sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmF,ecx
- psllq xmmA,xmmF
- psllq xmmE,xmmF
- jmp short .adj0
-.adj1: neg ecx
- movd xmmF,ecx
- psrlq xmmA,xmmF
- psrlq xmmE,xmmF
- psllq xmmG,xmmD
- psllq xmmC,xmmD
- por xmmA,xmmG
- por xmmE,xmmC
-.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
- pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%else
- pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%endif
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
- punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
- punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
- movdqa xmmC,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
- punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
- movdqa xmmG,xmmB
- punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
- punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- movdqa xmmH,xmmC
- punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test edi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
- add edi, byte SIZEOF_XMMWORD ; outptr
-.out0:
- sub ecx, byte SIZEOF_XMMWORD
- jz near .nextrow
-
- add esi, byte SIZEOF_XMMWORD ; inptr0
- add ebx, byte SIZEOF_XMMWORD ; inptr1
- add edx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
-
-.column_st32:
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
- cmp ecx, byte SIZEOF_XMMWORD/2
- jb short .column_st16
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmC
- movdqa xmmD,xmmH
- sub ecx, byte SIZEOF_XMMWORD/2
-.column_st16:
- cmp ecx, byte SIZEOF_XMMWORD/4
- jb short .column_st15
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub ecx, byte SIZEOF_XMMWORD/4
-.column_st15:
- cmp ecx, byte SIZEOF_XMMWORD/16
- jb short .nextrow
- mov eax,ecx
- xor ecx, byte 0x03
- inc ecx
- shl ecx, 4
- movd xmmF,ecx
- psrlq xmmE,xmmF
- punpcklbw xmmE,xmmE
- ; ----------------
- mov ecx,edi
- and ecx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
- cmp eax, byte SIZEOF_XMMWORD
- ja short .adj0
- and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmB,xmmA
- movdqa xmmG,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmC,ecx
- sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmH,ecx
- psllq xmmA,xmmH
- psllq xmmE,xmmH
- jmp short .adj0
-.adj1: neg ecx
- movd xmmH,ecx
- psrlq xmmA,xmmH
- psrlq xmmE,xmmH
- psllq xmmB,xmmC
- psllq xmmG,xmmC
- por xmmA,xmmB
- por xmmE,xmmG
-.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- alignx 16,7
-
-.nextrow:
- pop ecx
- pop esi
- pop ebx
- pop edx
- pop edi
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- add edi, byte SIZEOF_JSAMPROW ; output_buf
- dec eax ; num_rows
- jg near .rowloop
-
- sfence ; flush the write buffer
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
+%include "jdclrss2.asm" ; expansion 1: generic jsimd_ycc_rgb_convert_sse2, assembled with whatever RGB_* values are in effect on entry (presumably set by an earlier include -- not visible in this hunk)
+
+%undef RGB_RED ; expansion 2 (extrgb): drop the previous layout so it can be redefined below
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0 ; NOTE(review): RGB_RED/GREEN/BLUE look like per-component positions inside one pixel -- confirm against jdclrss2.asm
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3 ; the included code branches on RGB_PIXELSIZE == 3 versus 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2 ; from here on the generic name expands to the extrgb name
+%include "jdclrss2.asm" ; re-assemble the same template under the new name and layout
+
+%undef RGB_RED ; expansion 3 (extrgbx): same component order as extrgb, RGB_PIXELSIZE 4
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2 ; plain %define replaces the previous mapping; no %undef of the name is needed
+%include "jdclrss2.asm"
+
+%undef RGB_RED ; expansion 4 (extbgr): red and blue values swapped relative to extrgb, RGB_PIXELSIZE 3
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
+%include "jdclrss2.asm"
+
+%undef RGB_RED ; expansion 5 (extbgrx): same component order as extbgr, RGB_PIXELSIZE 4
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
+%include "jdclrss2.asm"
+
+%undef RGB_RED ; expansion 6 (extxbgr): colors take values 3,2,1; value 0 is not assigned to any color
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
+%include "jdclrss2.asm"
+
+%undef RGB_RED ; expansion 7 (extxrgb): colors take values 1,2,3; value 0 is not assigned to any color
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
+%include "jdclrss2.asm"
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 371586e..a5b432f 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -21,11 +21,35 @@
#ifdef NEED_SHORT_EXTERNAL_NAMES
#define jpeg_simd_cpu_support jSiCpuSupport
#define jsimd_rgb_ycc_convert_mmx jSRGBYCCM
+#define jsimd_extrgb_ycc_convert_mmx jSEXTRGBYCCM
+#define jsimd_extrgbx_ycc_convert_mmx jSEXTRGBXYCCM
+#define jsimd_extbgr_ycc_convert_mmx jSEXTBGRYCCM
+#define jsimd_extbgrx_ycc_convert_mmx jSEXTBGRXYCCM
+#define jsimd_extxbgr_ycc_convert_mmx jSEXTXBGRYCCM
+#define jsimd_extxrgb_ycc_convert_mmx jSEXTXRGBYCCM
#define jsimd_ycc_rgb_convert_mmx jSYCCRGBM
+#define jsimd_ycc_extrgb_convert_mmx jSYCCEXTRGBM
+#define jsimd_ycc_extrgbx_convert_mmx jSYCCEXTRGBXM
+#define jsimd_ycc_extbgr_convert_mmx jSYCCEXTBGRM
+#define jsimd_ycc_extbgrx_convert_mmx jSYCCEXTBGRXM
+#define jsimd_ycc_extxbgr_convert_mmx jSYCCEXTXBGRM
+#define jsimd_ycc_extxrgb_convert_mmx jSYCCEXTXRGBM
#define jconst_rgb_ycc_convert_sse2 jSCRGBYCCS2
#define jsimd_rgb_ycc_convert_sse2 jSRGBYCCS2
+#define jsimd_extrgb_ycc_convert_sse2 jSEXTRGBYCCS2
+#define jsimd_extrgbx_ycc_convert_sse2 jSEXTRGBXYCCS2
+#define jsimd_extbgr_ycc_convert_sse2 jSEXTBGRYCCS2
+#define jsimd_extbgrx_ycc_convert_sse2 jSEXTBGRXYCCS2
+#define jsimd_extxbgr_ycc_convert_sse2 jSEXTXBGRYCCS2
+#define jsimd_extxrgb_ycc_convert_sse2 jSEXTXRGBYCCS2
#define jconst_ycc_rgb_convert_sse2 jSCYCCRGBS2
#define jsimd_ycc_rgb_convert_sse2 jSYCCRGBS2
+#define jsimd_ycc_extrgb_convert_sse2 jSYCCEXTRGBS2
+#define jsimd_ycc_extrgbx_convert_sse2 jSYCCEXTRGBXS2
+#define jsimd_ycc_extbgr_convert_sse2 jSYCCEXTBGRS2
+#define jsimd_ycc_extbgrx_convert_sse2 jSYCCEXTBGRXS2
+#define jsimd_ycc_extxbgr_convert_sse2 jSYCCEXTXBGRS2
+#define jsimd_ycc_extxrgb_convert_sse2 jSYCCEXTXRGBS2
#define jsimd_h2v2_downsample_mmx jSDnH2V2M
#define jsimd_h2v1_downsample_mmx jSDnH2V1M
#define jsimd_h2v2_downsample_sse2 jSDnH2V2S2
@@ -89,21 +113,119 @@ EXTERN(void) jsimd_rgb_ycc_convert_mmx
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+
EXTERN(void) jsimd_ycc_rgb_convert_mmx
JPP((JDIMENSION out_width,
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_mmx
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmx
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_mmx
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmx
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmx
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmx
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
extern const int jconst_rgb_ycc_convert_sse2[];
EXTERN(void) jsimd_rgb_ycc_convert_sse2
JPP((JDIMENSION img_width,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+
extern const int jconst_ycc_rgb_convert_sse2[];
EXTERN(void) jsimd_ycc_rgb_convert_sse2
JPP((JDIMENSION out_width,
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_sse2
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_sse2
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_sse2
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_sse2
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_sse2
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
/* SIMD Downsample */
EXTERN(void) jsimd_h2v2_downsample_mmx