Add MMX SIMD implementation of computationally intensive routines.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@17 632fc199-4ca6-4c93-a231-07263d6284db
author: ossman_ <ossman_@632fc199-4ca6-4c93-a231-07263d6284db> 2009-03-09 13:25:30 +0000
committer: ossman_ <ossman_@632fc199-4ca6-4c93-a231-07263d6284db> 2009-03-09 13:25:30 +0000
commit: 65984f5ac9933a1fc74c22da22671156f9b9de15 (patch)
tree: 809d3bf09560d2e99d61f190ea2d16e6d3122b13 /simd
parent: ddd9d54d5e181e76197c787220f3bc46e206deba (diff)
17 files changed, 6140 insertions, 1 deletions
diff --git a/simd/jccolmmx.asm b/simd/jccolmmx.asm
new file mode 100644
index 0000000..701a327
--- /dev/null
+++ b/simd/jccolmmx.asm
@@ -0,0 +1,508 @@
+;
+; jccolmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "simd/jsimdext.inc"
+%include "simd/jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_081	equ	 5329			; FIX(0.08131)
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_168	equ	11059			; FIX(0.16874)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_331	equ	21709			; FIX(0.33126)
+F_0_418	equ	27439			; FIX(0.41869)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_ycc_convert_mmx)
+
+EXTN(jconst_rgb_ycc_convert_mmx):
+
+PW_F0299_F0337	times 2 dw  F_0_299, F_0_337
+PW_F0114_F0250	times 2 dw  F_0_114, F_0_250
+PW_MF016_MF033	times 2 dw -F_0_168,-F_0_331
+PW_MF008_MF041	times 2 dw -F_0_081,-F_0_418
+PD_ONEHALFM1_CJ	times 2 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF	times 2 dd  (1 << (SCALEBITS-1))
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
+;                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                           JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION img_width
+%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
+%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
+%define output_row(b)	(b)+20		; JDIMENSION output_row
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		8
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_rgb_ycc_convert_mmx)
+
+EXTN(jsimd_rgb_ycc_convert_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [img_width(eax)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	esi, JSAMPIMAGE [output_buf(eax)]
+	mov	ecx, JDIMENSION [output_row(eax)]
+	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	esi, JSAMPARRAY [input_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	pushpic	eax
+	push	edx
+	push	ebx
+	push	edi
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr0
+	mov	ebx, JSAMPROW [ebx]	; outptr1
+	mov	edx, JSAMPROW [edx]	; outptr2
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jae	short .columnloop
+	alignx	16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+	push	eax
+	push	edx
+	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_BYTE
+	xor	eax,eax
+	mov	al, BYTE [esi+ecx]
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_WORD
+	xor	edx,edx
+	mov	dx, WORD [esi+ecx]
+	shl	eax, WORD_BIT
+	or	eax,edx
+.column_ld4:
+	movd	mmA,eax
+	pop	edx
+	pop	eax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_DWORD
+	movd	mmG, DWORD [esi+ecx]
+	psllq	mmA, DWORD_BIT
+	por	mmA,mmG
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	movq	mmG,mmA
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	mov	ecx, SIZEOF_MMWORD
+	jmp	short .rgb_ycc_cnv
+.column_ld16:
+	test	cl, 2*SIZEOF_MMWORD
+	mov	ecx, SIZEOF_MMWORD
+	jz	short .rgb_ycc_cnv
+	movq	mmF,mmA
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+	; mmA=(00 10 20 01 11 21 02 12)
+	; mmG=(22 03 13 23 04 14 24 05)
+	; mmF=(15 25 06 16 26 07 17 27)
+
+	movq      mmD,mmA
+	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 10 20 01)
+	psrlq     mmD,4*BYTE_BIT	; mmD=(11 21 02 12 -- -- -- --)
+
+	punpckhbw mmA,mmG		; mmA=(00 04 10 14 20 24 01 05)
+	psllq     mmG,4*BYTE_BIT	; mmG=(-- -- -- -- 22 03 13 23)
+
+	punpcklbw mmD,mmF		; mmD=(11 15 21 25 02 06 12 16)
+	punpckhbw mmG,mmF		; mmG=(22 26 03 07 13 17 23 27)
+
+	movq      mmE,mmA
+	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 04 10 14)
+	psrlq     mmE,4*BYTE_BIT	; mmE=(20 24 01 05 -- -- -- --)
+
+	punpckhbw mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
+	psllq     mmD,4*BYTE_BIT	; mmD=(-- -- -- -- 11 15 21 25)
+
+	punpcklbw mmE,mmG		; mmE=(20 22 24 26 01 03 05 07)
+	punpckhbw mmD,mmG		; mmD=(11 13 15 17 21 23 25 27)
+
+	pxor      mmH,mmH
+
+	movq      mmC,mmA
+	punpcklbw mmA,mmH		; mmA=(00 02 04 06)
+	punpckhbw mmC,mmH		; mmC=(10 12 14 16)
+
+	movq      mmB,mmE
+	punpcklbw mmE,mmH		; mmE=(20 22 24 26)
+	punpckhbw mmB,mmH		; mmB=(01 03 05 07)
+
+	movq      mmF,mmD
+	punpcklbw mmD,mmH		; mmD=(11 13 15 17)
+	punpckhbw mmF,mmH		; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+	test	cl, SIZEOF_MMWORD/8
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_MMWORD/8
+	movd	mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+	test	cl, SIZEOF_MMWORD/4
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_MMWORD/4
+	movq	mmF,mmA
+	movq	mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+	test	cl, SIZEOF_MMWORD/2
+	mov	ecx, SIZEOF_MMWORD
+	jz	short .rgb_ycc_cnv
+	movq	mmD,mmA
+	movq	mmC,mmF
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+	movq	mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+	; mmA=(00 10 20 30 01 11 21 31)
+	; mmF=(02 12 22 32 03 13 23 33)
+	; mmD=(04 14 24 34 05 15 25 35)
+	; mmC=(06 16 26 36 07 17 27 37)
+
+	movq      mmB,mmA
+	punpcklbw mmA,mmF		; mmA=(00 02 10 12 20 22 30 32)
+	punpckhbw mmB,mmF		; mmB=(01 03 11 13 21 23 31 33)
+
+	movq      mmG,mmD
+	punpcklbw mmD,mmC		; mmD=(04 06 14 16 24 26 34 36)
+	punpckhbw mmG,mmC		; mmG=(05 07 15 17 25 27 35 37)
+
+	movq      mmE,mmA
+	punpcklwd mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
+	punpckhwd mmE,mmD		; mmE=(20 22 24 26 30 32 34 36)
+
+	movq      mmH,mmB
+	punpcklwd mmB,mmG		; mmB=(01 03 05 07 11 13 15 17)
+	punpckhwd mmH,mmG		; mmH=(21 23 25 27 31 33 35 37)
+
+	pxor      mmF,mmF
+
+	movq      mmC,mmA
+	punpcklbw mmA,mmF		; mmA=(00 02 04 06)
+	punpckhbw mmC,mmF		; mmC=(10 12 14 16)
+
+	movq      mmD,mmB
+	punpcklbw mmB,mmF		; mmB=(01 03 05 07)
+	punpckhbw mmD,mmF		; mmD=(11 13 15 17)
+
+	movq      mmG,mmE
+	punpcklbw mmE,mmF		; mmE=(20 22 24 26)
+	punpckhbw mmG,mmF		; mmG=(30 32 34 36)
+
+	punpcklbw mmF,mmH
+	punpckhbw mmH,mmH
+	psrlw     mmF,BYTE_BIT		; mmF=(21 23 25 27)
+	psrlw     mmH,BYTE_BIT		; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+	; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+	movq      MMWORD [wk(0)], mm0	; wk(0)=RE
+	movq      MMWORD [wk(1)], mm1	; wk(1)=RO
+	movq      MMWORD [wk(2)], mm4	; wk(2)=BE
+	movq      MMWORD [wk(3)], mm5	; wk(3)=BO
+
+	movq      mm6,mm1
+	punpcklwd mm1,mm3
+	punpckhwd mm6,mm3
+	movq      mm7,mm1
+	movq      mm4,mm6
+	pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+	movq      MMWORD [wk(4)], mm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+	movq      MMWORD [wk(5)], mm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	pxor      mm1,mm1
+	pxor      mm6,mm6
+	punpcklwd mm1,mm5		; mm1=BOL
+	punpckhwd mm6,mm5		; mm6=BOH
+	psrld     mm1,1			; mm1=BOL*FIX(0.500)
+	psrld     mm6,1			; mm6=BOH*FIX(0.500)
+
+	movq      mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
+
+	paddd     mm7,mm1
+	paddd     mm4,mm6
+	paddd     mm7,mm5
+	paddd     mm4,mm5
+	psrld     mm7,SCALEBITS		; mm7=CbOL
+	psrld     mm4,SCALEBITS		; mm4=CbOH
+	packssdw  mm7,mm4		; mm7=CbO
+
+	movq      mm1, MMWORD [wk(2)]	; mm1=BE
+
+	movq      mm6,mm0
+	punpcklwd mm0,mm2
+	punpckhwd mm6,mm2
+	movq      mm5,mm0
+	movq      mm4,mm6
+	pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+	movq      MMWORD [wk(6)], mm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movq      MMWORD [wk(7)], mm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	pxor      mm0,mm0
+	pxor      mm6,mm6
+	punpcklwd mm0,mm1		; mm0=BEL
+	punpckhwd mm6,mm1		; mm6=BEH
+	psrld     mm0,1			; mm0=BEL*FIX(0.500)
+	psrld     mm6,1			; mm6=BEH*FIX(0.500)
+
+	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+	paddd     mm5,mm0
+	paddd     mm4,mm6
+	paddd     mm5,mm1
+	paddd     mm4,mm1
+	psrld     mm5,SCALEBITS		; mm5=CbEL
+	psrld     mm4,SCALEBITS		; mm4=CbEH
+	packssdw  mm5,mm4		; mm5=CbE
+
+	psllw     mm7,BYTE_BIT
+	por       mm5,mm7		; mm5=Cb
+	movq      MMWORD [ebx], mm5	; Save Cb
+
+	movq      mm0, MMWORD [wk(3)]	; mm0=BO
+	movq      mm6, MMWORD [wk(2)]	; mm6=BE
+	movq      mm1, MMWORD [wk(1)]	; mm1=RO
+
+	movq      mm4,mm0
+	punpcklwd mm0,mm3
+	punpckhwd mm4,mm3
+	movq      mm7,mm0
+	movq      mm5,mm4
+	pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+	movq      mm3,[GOTOFF(eax,PD_ONEHALF)]	; mm3=[PD_ONEHALF]
+
+	paddd     mm0, MMWORD [wk(4)]
+	paddd     mm4, MMWORD [wk(5)]
+	paddd     mm0,mm3
+	paddd     mm4,mm3
+	psrld     mm0,SCALEBITS		; mm0=YOL
+	psrld     mm4,SCALEBITS		; mm4=YOH
+	packssdw  mm0,mm4		; mm0=YO
+
+	pxor      mm3,mm3
+	pxor      mm4,mm4
+	punpcklwd mm3,mm1		; mm3=ROL
+	punpckhwd mm4,mm1		; mm4=ROH
+	psrld     mm3,1			; mm3=ROL*FIX(0.500)
+	psrld     mm4,1			; mm4=ROH*FIX(0.500)
+
+	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+	paddd     mm7,mm3
+	paddd     mm5,mm4
+	paddd     mm7,mm1
+	paddd     mm5,mm1
+	psrld     mm7,SCALEBITS		; mm7=CrOL
+	psrld     mm5,SCALEBITS		; mm5=CrOH
+	packssdw  mm7,mm5		; mm7=CrO
+
+	movq      mm3, MMWORD [wk(0)]	; mm3=RE
+
+	movq      mm4,mm6
+	punpcklwd mm6,mm2
+	punpckhwd mm4,mm2
+	movq      mm1,mm6
+	movq      mm5,mm4
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+	pmaddwd   mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+	movq      mm2,[GOTOFF(eax,PD_ONEHALF)]	; mm2=[PD_ONEHALF]
+
+	paddd     mm6, MMWORD [wk(6)]
+	paddd     mm4, MMWORD [wk(7)]
+	paddd     mm6,mm2
+	paddd     mm4,mm2
+	psrld     mm6,SCALEBITS		; mm6=YEL
+	psrld     mm4,SCALEBITS		; mm4=YEH
+	packssdw  mm6,mm4		; mm6=YE
+
+	psllw     mm0,BYTE_BIT
+	por       mm6,mm0		; mm6=Y
+	movq      MMWORD [edi], mm6	; Save Y
+
+	pxor      mm2,mm2
+	pxor      mm4,mm4
+	punpcklwd mm2,mm3		; mm2=REL
+	punpckhwd mm4,mm3		; mm4=REH
+	psrld     mm2,1			; mm2=REL*FIX(0.500)
+	psrld     mm4,1			; mm4=REH*FIX(0.500)
+
+	movq      mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
+
+	paddd     mm1,mm2
+	paddd     mm5,mm4
+	paddd     mm1,mm0
+	paddd     mm5,mm0
+	psrld     mm1,SCALEBITS		; mm1=CrEL
+	psrld     mm5,SCALEBITS		; mm5=CrEH
+	packssdw  mm1,mm5		; mm1=CrE
+
+	psllw     mm7,BYTE_BIT
+	por       mm1,mm7		; mm1=Cr
+	movq      MMWORD [edx], mm1	; Save Cr
+
+	sub	ecx, byte SIZEOF_MMWORD
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; inptr
+	add	edi, byte SIZEOF_MMWORD			; outptr0
+	add	ebx, byte SIZEOF_MMWORD			; outptr1
+	add	edx, byte SIZEOF_MMWORD			; outptr2
+	cmp	ecx, byte SIZEOF_MMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .column_ld1
+
+	pop	ecx			; col
+	pop	esi
+	pop	edi
+	pop	ebx
+	pop	edx
+	poppic	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_buf
+	add	edi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
diff --git a/simd/jcolsamp.inc b/simd/jcolsamp.inc
new file mode 100644
index 0000000..56b6bfd
--- /dev/null
+++ b/simd/jcolsamp.inc
@@ -0,0 +1,73 @@
+;
+; jcolsamp.inc - private declarations for color conversion & up/downsampling
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; [TAB8]
+
+; --------------------------------------------------------------------------
+
+; pseudo-resisters to make ordering of RGB configurable
+;
+%if RGB_RED == 0
+%define  mmA  mm0
+%define  mmB  mm1
+%elif RGB_GREEN == 0
+%define  mmA  mm2
+%define  mmB  mm3
+%elif RGB_BLUE == 0
+%define  mmA  mm4
+%define  mmB  mm5
+%else
+%define  mmA  mm6
+%define  mmB  mm7
+%endif
+
+%if RGB_RED == 1
+%define  mmC  mm0
+%define  mmD  mm1
+%elif RGB_GREEN == 1
+%define  mmC  mm2
+%define  mmD  mm3
+%elif RGB_BLUE == 1
+%define  mmC  mm4
+%define  mmD  mm5
+%else
+%define  mmC  mm6
+%define  mmD  mm7
+%endif
+
+%if RGB_RED == 2
+%define  mmE  mm0
+%define  mmF  mm1
+%elif RGB_GREEN == 2
+%define  mmE  mm2
+%define  mmF  mm3
+%elif RGB_BLUE == 2
+%define  mmE  mm4
+%define  mmF  mm5
+%else
+%define  mmE  mm6
+%define  mmF  mm7
+%endif
+
+%if RGB_RED == 3
+%define  mmG  mm0
+%define  mmH  mm1
+%elif RGB_GREEN == 3
+%define  mmG  mm2
+%define  mmH  mm3
+%elif RGB_BLUE == 3
+%define  mmG  mm4
+%define  mmH  mm5
+%else
+%define  mmG  mm6
+%define  mmH  mm7
+%endif
+
+; --------------------------------------------------------------------------
diff --git a/simd/jcqntmmx.asm b/simd/jcqntmmx.asm
new file mode 100644
index 0000000..6096eaa
--- /dev/null
+++ b/simd/jcqntmmx.asm
@@ -0,0 +1,271 @@
+;
+; jcqntmmx.asm - sample data conversion and quantization (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "simd/jsimdext.inc"
+%include "simd/jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_mmx)
+
+EXTN(jsimd_convsamp_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pxor	mm6,mm6			; mm6=(all 0's)
+	pcmpeqw	mm7,mm7
+	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm0=(01234567)
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm1=(89ABCDEF)
+
+	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm2=(GHIJKLMN)
+	movq	mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm3=(OPQRSTUV)
+
+	movq      mm4,mm0
+	punpcklbw mm0,mm6		; mm0=(0123)
+	punpckhbw mm4,mm6		; mm4=(4567)
+	movq      mm5,mm1
+	punpcklbw mm1,mm6		; mm1=(89AB)
+	punpckhbw mm5,mm6		; mm5=(CDEF)
+
+	paddw	mm0,mm7
+	paddw	mm4,mm7
+	paddw	mm1,mm7
+	paddw	mm5,mm7
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+
+	movq      mm0,mm2
+	punpcklbw mm2,mm6		; mm2=(GHIJ)
+	punpckhbw mm0,mm6		; mm0=(KLMN)
+	movq      mm4,mm3
+	punpcklbw mm3,mm6		; mm3=(OPQR)
+	punpckhbw mm4,mm6		; mm4=(STUV)
+
+	paddw	mm2,mm7
+	paddw	mm0,mm7
+	paddw	mm3,mm7
+	paddw	mm4,mm7
+
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+	add	esi, byte 4*SIZEOF_JSAMPROW
+	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	short .convloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
+;                     DCTELEM * workspace);
+;
+
+%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+%define SHIFT(m,n,b)      MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_mmx)
+
+EXTN(jsimd_quantize_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	ah, 2
+	alignx	16,7
+.quantloop1:
+	mov	al, DCTSIZE2/8/2
+	alignx	16,7
+.quantloop2:
+	movq	mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+
+	movq	mm0,mm2
+	movq	mm1,mm3
+
+	psraw	mm2,(WORD_BIT-1)  ; -1 if value < 0, 0 otherwise
+	psraw	mm3,(WORD_BIT-1)
+
+	pxor	mm0,mm2   ; val = -val
+	pxor	mm1,mm3
+	psubw	mm0,mm2
+	psubw	mm1,mm3
+
+	;
+	; MMX is an annoyingly crappy instruction set. It has two
+	; misfeatures that are causing problems here:
+	;
+	; - All multiplications are signed.
+	;
+	; - The second operand for the shifts is not treated as packed.
+	;
+	;
+	; We work around the first problem by implementing this algorithm:
+	;
+	; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+	; {
+	;   enum { SHORT_BIT = 16 };
+	;   signed short sx = (signed short) x;
+	;   signed short sy = (signed short) y;
+	;   signed long sz;
+	; 
+	;   sz = (long) sx * (long) sy;     /* signed multiply */
+	; 
+	;   if (sx < 0) sz += (long) sy << SHORT_BIT;
+	;   if (sy < 0) sz += (long) sx << SHORT_BIT;
+	; 
+	;   return (unsigned long) sz;
+	; }
+	;
+	; (note that a negative sx adds _sy_ and vice versa)
+	;
+	; For the second problem, we replace the shift by a multiplication.
+	; Unfortunately that means we have to deal with the signed issue again.
+	;
+
+	paddw	mm0, MMWORD [CORRECTION(0,0,edx)]   ; correction + roundfactor
+	paddw	mm1, MMWORD [CORRECTION(0,1,edx)]
+
+	movq	mm4,mm0   ; store current value for later
+	movq	mm5,mm1
+	pmulhw	mm0, MMWORD [RECIPROCAL(0,0,edx)]   ; reciprocal
+	pmulhw	mm1, MMWORD [RECIPROCAL(0,1,edx)]
+	paddw	mm0,mm4		; reciprocal is always negative (MSB=1),
+	paddw	mm1,mm5   ; so we always need to add the initial value
+	                ; (input value is never negative as we
+	                ; inverted it at the start of this routine)
+
+	; here it gets a bit tricky as both scale
+	; and mm0/mm1 can be negative
+	movq	mm6, MMWORD [SCALE(0,0,edx)]	; scale
+	movq	mm7, MMWORD [SCALE(0,1,edx)]
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pmulhw	mm0,mm6
+	pmulhw	mm1,mm7
+
+	psraw	mm6,(WORD_BIT-1)    ; determine if scale is negative
+	psraw	mm7,(WORD_BIT-1)
+
+	pand	mm6,mm4             ; and add input if it is
+	pand	mm7,mm5
+	paddw	mm0,mm6
+	paddw	mm1,mm7
+
+	psraw	mm4,(WORD_BIT-1)    ; then check if negative input 
+	psraw	mm5,(WORD_BIT-1)
+
+	pand	mm4, MMWORD [SCALE(0,0,edx)]	; and add scale if it is
+	pand	mm5, MMWORD [SCALE(0,1,edx)]
+	paddw	mm0,mm4
+	paddw	mm1,mm5
+
+	pxor	mm0,mm2   ; val = -val
+	pxor	mm1,mm3
+	psubw	mm0,mm2
+	psubw	mm1,mm3
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+	add	esi, byte 8*SIZEOF_DCTELEM
+	add	edx, byte 8*SIZEOF_DCTELEM
+	add	edi, byte 8*SIZEOF_JCOEF
+	dec	al
+	jnz	near .quantloop2
+	dec	ah
+	jnz	near .quantloop1	; to avoid branch misprediction
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
diff --git a/simd/jcsammmx.asm b/simd/jcsammmx.asm
new file mode 100644
index 0000000..5de3637
--- /dev/null
+++ b/simd/jcsammmx.asm
@@ -0,0 +1,321 @@
+;
+; jcsammmx.asm - downsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "simd/jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION image_width
+%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
+%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
+%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
+%define input_data(b)	(b)+24		; JSAMPARRAY input_data
+%define output_data(b)	(b)+28	; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v1_downsample_mmx)
+
+EXTN(jsimd_h2v1_downsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, JDIMENSION [width_blks(ebp)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, JDIMENSION [img_width(ebp)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx
+	jle	short .expand_end
+
+	mov	eax, INT [max_v_samp(ebp)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax
+	push	ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx
+	mov	al, JSAMPLE [edi-1]
+
+	rep stosb
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v1_downsample
+
+	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
+	test	eax,eax
+	jle	short .return
+
+	mov       edx, 0x00010000	; bias pattern
+	movd      mm7,edx
+	pcmpeqw   mm6,mm6
+	punpckldq mm7,mm7		; mm7={0, 1, 0, 1}
+	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm1, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mm2,mm0
+	movq	mm3,mm1
+
+	pand	mm0,mm6
+	psrlw	mm2,BYTE_BIT
+	pand	mm1,mm6
+	psrlw	mm3,BYTE_BIT
+
+	paddw	mm0,mm2
+	paddw	mm1,mm3
+	paddw	mm0,mm7
+	paddw	mm1,mm7
+	psrlw	mm0,1
+	psrlw	mm1,1
+
+	packuswb mm0,mm1
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr
+	add	edi, byte 1*SIZEOF_MMWORD	; outptr
+	sub	ecx, byte SIZEOF_MMWORD		; outcol
+	jnz	short .columnloop
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	short .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION image_width
+%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
+%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
+%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
+%define input_data(b)	(b)+24		; JSAMPARRAY input_data
+%define output_data(b)	(b)+28	; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v2_downsample_mmx)
+
+EXTN(jsimd_h2v2_downsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, JDIMENSION [width_blks(ebp)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, JDIMENSION [img_width(ebp)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx
+	jle	short .expand_end
+
+	mov	eax, INT [max_v_samp(ebp)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax
+	push	ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx
+	mov	al, JSAMPLE [edi-1]
+
+	rep stosb
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v2_downsample
+
+	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov       edx, 0x00020001	; bias pattern
+	movd      mm7,edx
+	pcmpeqw   mm6,mm6
+	punpckldq mm7,mm7		; mm7={1, 2, 1, 2}
+	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
+	mov	edi, JSAMPROW [edi]			; outptr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [edx+0*SIZEOF_MMWORD]
+	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm2, MMWORD [edx+1*SIZEOF_MMWORD]
+	movq	mm3, MMWORD [esi+1*SIZEOF_MMWORD]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pand	mm0,mm6
+	psrlw	mm4,BYTE_BIT
+	pand	mm1,mm6
+	psrlw	mm5,BYTE_BIT
+	paddw	mm0,mm4
+	paddw	mm1,mm5
+
+	movq	mm4,mm2
+	movq	mm5,mm3
+	pand	mm2,mm6
+	psrlw	mm4,BYTE_BIT
+	pand	mm3,mm6
+	psrlw	mm5,BYTE_BIT
+	paddw	mm2,mm4
+	paddw	mm3,mm5
+
+	paddw	mm0,mm1
+	paddw	mm2,mm3
+	paddw	mm0,mm7
+	paddw	mm2,mm7
+	psrlw	mm0,2
+	psrlw	mm2,2
+
+	packuswb mm0,mm2
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+	add	edx, byte 2*SIZEOF_MMWORD	; inptr0
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr1
+	add	edi, byte 1*SIZEOF_MMWORD	; outptr
+	sub	ecx, byte SIZEOF_MMWORD		; outcol
+	jnz	near .columnloop
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
diff --git a/simd/jdcolmmx.asm b/simd/jdcolmmx.asm
new file mode 100644
index 0000000..33d5063
--- /dev/null
+++ b/simd/jdcolmmx.asm
@@ -0,0 +1,433 @@
+;
+; jdcolmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "simd/jsimdext.inc"
+%include "simd/jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_ycc_rgb_convert_mmx)
+
+EXTN(jconst_ycc_rgb_convert_mmx):
+
+PW_F0402	times 4 dw  F_0_402
+PW_MF0228	times 4 dw -F_0_228
+PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285
+PW_ONE		times 4 dw  1
+PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
+;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                            JSAMPARRAY output_buf, int num_rows)
+;
+
+%define out_width(b)	(b)+8			; JDIMENSION out_width
+%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
+%define input_row(b)	(b)+16		; JDIMENSION input_row
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_ycc_rgb_convert_mmx)
+
+EXTN(jsimd_ycc_rgb_convert_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [out_width(eax)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [input_row(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	push	eax
+	push	edi
+	push	edx
+	push	ebx
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr0
+	mov	ebx, JSAMPROW [ebx]	; inptr1
+	mov	edx, JSAMPROW [edx]	; inptr2
+	mov	edi, JSAMPROW [edi]	; outptr
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+	alignx	16,7
+.columnloop:
+
+	movq	mm5, MMWORD [ebx]	; mm5=Cb(01234567)
+	movq	mm1, MMWORD [edx]	; mm1=Cr(01234567)
+
+	pcmpeqw	mm4,mm4
+	pcmpeqw	mm7,mm7
+	psrlw	mm4,BYTE_BIT
+	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+	movq	mm0,mm4			; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+	pand	mm4,mm5			; mm4=Cb(0246)=CbE
+	psrlw	mm5,BYTE_BIT		; mm5=Cb(1357)=CbO
+	pand	mm0,mm1			; mm0=Cr(0246)=CrE
+	psrlw	mm1,BYTE_BIT		; mm1=Cr(1357)=CrO
+
+	paddw	mm4,mm7
+	paddw	mm5,mm7
+	paddw	mm0,mm7
+	paddw	mm1,mm7
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movq	mm2,mm4			; mm2=CbE
+	movq	mm3,mm5			; mm3=CbO
+	paddw	mm4,mm4			; mm4=2*CbE
+	paddw	mm5,mm5			; mm5=2*CbO
+	movq	mm6,mm0			; mm6=CrE
+	movq	mm7,mm1			; mm7=CrO
+	paddw	mm0,mm0			; mm0=2*CrE
+	paddw	mm1,mm1			; mm1=2*CrO
+
+	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbE * -FIX(0.22800))
+	pmulhw	mm5,[GOTOFF(eax,PW_MF0228)]	; mm5=(2*CbO * -FIX(0.22800))
+	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrE * FIX(0.40200))
+	pmulhw	mm1,[GOTOFF(eax,PW_F0402)]	; mm1=(2*CrO * FIX(0.40200))
+
+	paddw	mm4,[GOTOFF(eax,PW_ONE)]
+	paddw	mm5,[GOTOFF(eax,PW_ONE)]
+	psraw	mm4,1			; mm4=(CbE * -FIX(0.22800))
+	psraw	mm5,1			; mm5=(CbO * -FIX(0.22800))
+	paddw	mm0,[GOTOFF(eax,PW_ONE)]
+	paddw	mm1,[GOTOFF(eax,PW_ONE)]
+	psraw	mm0,1			; mm0=(CrE * FIX(0.40200))
+	psraw	mm1,1			; mm1=(CrO * FIX(0.40200))
+
+	paddw	mm4,mm2
+	paddw	mm5,mm3
+	paddw	mm4,mm2			; mm4=(CbE * FIX(1.77200))=(B-Y)E
+	paddw	mm5,mm3			; mm5=(CbO * FIX(1.77200))=(B-Y)O
+	paddw	mm0,mm6			; mm0=(CrE * FIX(1.40200))=(R-Y)E
+	paddw	mm1,mm7			; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(B-Y)E
+	movq	MMWORD [wk(1)], mm5	; wk(1)=(B-Y)O
+
+	movq      mm4,mm2
+	movq      mm5,mm3
+	punpcklwd mm2,mm6
+	punpckhwd mm4,mm6
+	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd mm3,mm7
+	punpckhwd mm5,mm7
+	pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm2,SCALEBITS
+	psrad     mm4,SCALEBITS
+	paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm3,SCALEBITS
+	psrad     mm5,SCALEBITS
+
+	packssdw  mm2,mm4	; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+	packssdw  mm3,mm5	; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+	psubw     mm2,mm6	; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+	psubw     mm3,mm7	; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+	movq      mm5, MMWORD [esi]	; mm5=Y(01234567)
+
+	pcmpeqw   mm4,mm4
+	psrlw     mm4,BYTE_BIT		; mm4={0xFF 0x00 0xFF 0x00 ..}
+	pand      mm4,mm5		; mm4=Y(0246)=YE
+	psrlw     mm5,BYTE_BIT		; mm5=Y(1357)=YO
+
+	paddw     mm0,mm4		; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+	paddw     mm1,mm5		; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
+	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+	paddw     mm2,mm4		; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+	paddw     mm3,mm5		; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
+	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+	paddw     mm4, MMWORD [wk(0)]	; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+	paddw     mm5, MMWORD [wk(1)]	; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
+	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
+	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
+
+	movq      mmG,mmA
+	movq      mmH,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
+	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
+
+	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
+	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
+
+	movq      mmC,mmD
+	movq      mmB,mmD
+	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
+	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
+
+	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
+
+	movq      mmF,mmE
+	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
+	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
+
+	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
+	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
+	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_MMWORD
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	mmA,mmC
+	sub	ecx, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+	jmp	short .column_st4
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmE
+	sub	ecx, byte SIZEOF_MMWORD
+	add	edi, byte SIZEOF_MMWORD
+.column_st4:
+	movd	eax,mmA
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st2
+	mov	DWORD [edi+0*SIZEOF_DWORD], eax
+	psrlq	mmA,DWORD_BIT
+	movd	eax,mmA
+	sub	ecx, byte SIZEOF_DWORD
+	add	edi, byte SIZEOF_DWORD
+.column_st2:
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [edi+0*SIZEOF_WORD], ax
+	shr	eax,WORD_BIT
+	sub	ecx, byte SIZEOF_WORD
+	add	edi, byte SIZEOF_WORD
+.column_st1:
+	cmp	ecx, byte SIZEOF_BYTE
+	jb	short .nextrow
+	mov	BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
+	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
+	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
+
+	movq      mmC,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
+	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
+	movq      mmG,mmB
+	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
+	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
+
+	movq      mmD,mmA
+	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
+	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
+	movq      mmH,mmC
+	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
+	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	cmp	ecx, byte SIZEOF_MMWORD/2
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	mmA,mmC
+	movq	mmD,mmH
+	sub	ecx, byte SIZEOF_MMWORD/2
+	add	edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD/4
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmD
+	sub	ecx, byte SIZEOF_MMWORD/4
+	add	edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+	cmp	ecx, byte SIZEOF_MMWORD/8
+	jb	short .nextrow
+	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	alignx	16,7
+
+.nextrow:
+	pop	ecx
+	pop	esi
+	pop	ebx
+	pop	edx
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	add	edi, byte SIZEOF_JSAMPROW	; output_buf
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
diff --git a/simd/jdct.inc b/simd/jdct.inc
new file mode 100644
index 0000000..e53f66e
--- /dev/null
+++ b/simd/jdct.inc
@@ -0,0 +1,27 @@
+;
+; jdct.inc - private declarations for forward & reverse DCT subsystems
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; [TAB8]
+
+; Each IDCT routine is responsible for range-limiting its results and
+; converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
+; be quite far out of range if the input data is corrupt, so a bulletproof
+; range-limiting step is required.  We use a mask-and-table-lookup method
+; to do the combined operations quickly.
+;
+%define RANGE_MASK  (MAXJSAMPLE * 4 + 3)  ; 2 bits wider than legal samples
+
+%define ROW(n,b,s)		((b)+(n)*(s))
+%define COL(n,b,s)		((b)+(n)*(s)*DCTSIZE)
+
+%define DWBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
+%define MMBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
+
+; --------------------------------------------------------------------------
diff --git a/simd/jdmermmx.asm b/simd/jdmermmx.asm
new file mode 100644
index 0000000..8d82e40
--- /dev/null
+++ b/simd/jdmermmx.asm
@@ -0,0 +1,492 @@
+;
+; jdmermmx.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "simd/jsimdext.inc"
+%include "simd/jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_merged_upsample_mmx)
+
+EXTN(jconst_merged_upsample_mmx):
+
+PW_F0402	times 4 dw  F_0_402
+PW_MF0228	times 4 dw -F_0_228
+PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285
+PW_ONE		times 4 dw  1
+PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)	(b)+8			; JDIMENSION output_width
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		3
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [output_width(eax)]	; col
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]		; inptr0
+	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
+	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
+	mov	edi, JSAMPROW [edi]				; outptr
+
+	pop	ecx			; col
+
+	alignx	16,7
+.columnloop:
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	movq      mm6, MMWORD [ebx]	; mm6=Cb(01234567)
+	movq      mm7, MMWORD [edx]	; mm7=Cr(01234567)
+
+	pxor      mm1,mm1		; mm1=(all 0's)
+	pcmpeqw   mm3,mm3
+	psllw     mm3,7			; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+	movq      mm4,mm6
+	punpckhbw mm6,mm1		; mm6=Cb(4567)=CbH
+	punpcklbw mm4,mm1		; mm4=Cb(0123)=CbL
+	movq      mm0,mm7
+	punpckhbw mm7,mm1		; mm7=Cr(4567)=CrH
+	punpcklbw mm0,mm1		; mm0=Cr(0123)=CrL
+
+	paddw     mm6,mm3
+	paddw     mm4,mm3
+	paddw     mm7,mm3
+	paddw     mm0,mm3
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movq	mm5,mm6			; mm5=CbH
+	movq	mm2,mm4			; mm2=CbL
+	paddw	mm6,mm6			; mm6=2*CbH
+	paddw	mm4,mm4			; mm4=2*CbL
+	movq	mm1,mm7			; mm1=CrH
+	movq	mm3,mm0			; mm3=CrL
+	paddw	mm7,mm7			; mm7=2*CrH
+	paddw	mm0,mm0			; mm0=2*CrL
+
+	pmulhw	mm6,[GOTOFF(eax,PW_MF0228)]	; mm6=(2*CbH * -FIX(0.22800))
+	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbL * -FIX(0.22800))
+	pmulhw	mm7,[GOTOFF(eax,PW_F0402)]	; mm7=(2*CrH * FIX(0.40200))
+	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrL * FIX(0.40200))
+
+	paddw	mm6,[GOTOFF(eax,PW_ONE)]
+	paddw	mm4,[GOTOFF(eax,PW_ONE)]
+	psraw	mm6,1			; mm6=(CbH * -FIX(0.22800))
+	psraw	mm4,1			; mm4=(CbL * -FIX(0.22800))
+	paddw	mm7,[GOTOFF(eax,PW_ONE)]
+	paddw	mm0,[GOTOFF(eax,PW_ONE)]
+	psraw	mm7,1			; mm7=(CrH * FIX(0.40200))
+	psraw	mm0,1			; mm0=(CrL * FIX(0.40200))
+
+	paddw	mm6,mm5
+	paddw	mm4,mm2
+	paddw	mm6,mm5			; mm6=(CbH * FIX(1.77200))=(B-Y)H
+	paddw	mm4,mm2			; mm4=(CbL * FIX(1.77200))=(B-Y)L
+	paddw	mm7,mm1			; mm7=(CrH * FIX(1.40200))=(R-Y)H
+	paddw	mm0,mm3			; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+	movq	MMWORD [wk(0)], mm6	; wk(0)=(B-Y)H
+	movq	MMWORD [wk(1)], mm7	; wk(1)=(R-Y)H
+
+	movq      mm6,mm5
+	movq      mm7,mm2
+	punpcklwd mm5,mm1
+	punpckhwd mm6,mm1
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd mm2,mm3
+	punpckhwd mm7,mm3
+	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm5,SCALEBITS
+	psrad     mm6,SCALEBITS
+	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm2,SCALEBITS
+	psrad     mm7,SCALEBITS
+
+	packssdw  mm5,mm6	; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+	packssdw  mm2,mm7	; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+	psubw     mm5,mm1	; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+	psubw     mm2,mm3	; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+	movq	MMWORD [wk(2)], mm5	; wk(2)=(G-Y)H
+
+	mov	al,2			; Yctr
+	jmp	short .Yloop_1st
+	alignx	16,7
+
+.Yloop_2nd:
+	movq	mm0, MMWORD [wk(1)]	; mm0=(R-Y)H
+	movq	mm2, MMWORD [wk(2)]	; mm2=(G-Y)H
+	movq	mm4, MMWORD [wk(0)]	; mm4=(B-Y)H
+	alignx	16,7
+
+.Yloop_1st:
+	movq	mm7, MMWORD [esi]	; mm7=Y(01234567)
+
+	pcmpeqw	mm6,mm6
+	psrlw	mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+	pand	mm6,mm7			; mm6=Y(0246)=YE
+	psrlw	mm7,BYTE_BIT		; mm7=Y(1357)=YO
+
+	movq	mm1,mm0			; mm1=mm0=(R-Y)(L/H)
+	movq	mm3,mm2			; mm3=mm2=(G-Y)(L/H)
+	movq	mm5,mm4			; mm5=mm4=(B-Y)(L/H)
+
+	paddw     mm0,mm6		; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+	paddw     mm1,mm7		; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
+	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+	paddw     mm2,mm6		; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+	paddw     mm3,mm7		; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
+	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+	paddw     mm4,mm6		; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+	paddw     mm5,mm7		; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
+	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
+	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
+
+	movq      mmG,mmA
+	movq      mmH,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
+	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
+
+	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
+	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
+
+	movq      mmC,mmD
+	movq      mmB,mmD
+	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
+	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
+
+	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
+
+	movq      mmF,mmE
+	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
+	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
+
+	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
+	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
+	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .endcolumn
+
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_MMWORD
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	mmA,mmC
+	sub	ecx, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+	jmp	short .column_st4
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmE
+	sub	ecx, byte SIZEOF_MMWORD
+	add	edi, byte SIZEOF_MMWORD
+.column_st4:
+	movd	eax,mmA
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st2
+	mov	DWORD [edi+0*SIZEOF_DWORD], eax
+	psrlq	mmA,DWORD_BIT
+	movd	eax,mmA
+	sub	ecx, byte SIZEOF_DWORD
+	add	edi, byte SIZEOF_DWORD
+.column_st2:
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [edi+0*SIZEOF_WORD], ax
+	shr	eax,WORD_BIT
+	sub	ecx, byte SIZEOF_WORD
+	add	edi, byte SIZEOF_WORD
+.column_st1:
+	cmp	ecx, byte SIZEOF_BYTE
+	jb	short .endcolumn
+	mov	BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
+	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
+	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
+
+	movq      mmC,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
+	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
+	movq      mmG,mmB
+	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
+	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
+
+	movq      mmD,mmA
+	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
+	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
+	movq      mmH,mmC
+	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
+	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .endcolumn
+
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	cmp	ecx, byte SIZEOF_MMWORD/2
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	mmA,mmC
+	movq	mmD,mmH
+	sub	ecx, byte SIZEOF_MMWORD/2
+	add	edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD/4
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmD
+	sub	ecx, byte SIZEOF_MMWORD/4
+	add	edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+	cmp	ecx, byte SIZEOF_MMWORD/8
+	jb	short .endcolumn
+	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)	(b)+8			; JDIMENSION output_width
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+	align	16
+	global	EXTN(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	eax, JDIMENSION [output_width(ebp)]
+
+	mov	edi, JSAMPIMAGE [input_buf(ebp)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(ebp)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	mov	edi, JSAMPARRAY [output_buf(ebp)]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+	push	edx			; inptr2
+	push	ebx			; inptr1
+	push	esi			; inptr00
+	mov	ebx,esp
+
+	push	edi			; output_buf (outptr0)
+	push	ecx			; in_row_group_ctr
+	push	ebx			; input_buf
+	push	eax			; output_width
+
+	call	near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+	add	esi, byte SIZEOF_JSAMPROW	; inptr01
+	add	edi, byte SIZEOF_JSAMPROW	; outptr1
+	mov	POINTER [ebx+0*SIZEOF_POINTER], esi
+	mov	POINTER [ebx-1*SIZEOF_POINTER], edi
+
+	call	near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+	add	esp, byte 7*SIZEOF_DWORD
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
diff --git a/simd/jdsammmx.asm b/simd/jdsammmx.asm
new file mode 100644
index 0000000..f061ee9
--- /dev/null
+++ b/simd/jdsammmx.asm
@@ -0,0 +1,734 @@
+;
+; jdsammmx.asm - upsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "simd/jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fancy_upsample_mmx)
+
+EXTN(jconst_fancy_upsample_mmx):
+
+PW_ONE		times 4 dw  1
+PW_TWO		times 4 dw  2
+PW_THREE	times 4 dw  3
+PW_SEVEN	times 4 dw  7
+PW_EIGHT	times 4 dw  8
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v1_fancy_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+	test	eax,eax
+	jz	near .return
+
+	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax			; colctr
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr
+
+	test	eax, SIZEOF_MMWORD-1
+	jz	short .skip
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+.skip:
+	pxor	mm0,mm0			; mm0=(all 0's)
+	pcmpeqb	mm7,mm7
+	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
+	pand	mm7, MMWORD [esi+0*SIZEOF_MMWORD]
+
+	add	eax, byte SIZEOF_MMWORD-1
+	and	eax, byte -SIZEOF_MMWORD
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	short .columnloop
+	alignx	16,7
+
+.columnloop_last:
+	pcmpeqb	mm6,mm6
+	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
+	pand	mm6, MMWORD [esi+0*SIZEOF_MMWORD]
+	jmp	short .upsample
+	alignx	16,7
+
+.columnloop:
+	movq	mm6, MMWORD [esi+1*SIZEOF_MMWORD]
+	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
+
+.upsample:
+	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm2,mm1
+	movq	mm3,mm1			; mm1=( 0 1 2 3 4 5 6 7)
+	psllq	mm2,BYTE_BIT		; mm2=( - 0 1 2 3 4 5 6)
+	psrlq	mm3,BYTE_BIT		; mm3=( 1 2 3 4 5 6 7 -)
+
+	por	mm2,mm7			; mm2=(-1 0 1 2 3 4 5 6)
+	por	mm3,mm6			; mm3=( 1 2 3 4 5 6 7 8)
+
+	movq	mm7,mm1
+	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm7=( 7 - - - - - - -)
+
+	movq      mm4,mm1
+	punpcklbw mm1,mm0		; mm1=( 0 1 2 3)
+	punpckhbw mm4,mm0		; mm4=( 4 5 6 7)
+	movq      mm5,mm2
+	punpcklbw mm2,mm0		; mm2=(-1 0 1 2)
+	punpckhbw mm5,mm0		; mm5=( 3 4 5 6)
+	movq      mm6,mm3
+	punpcklbw mm3,mm0		; mm3=( 1 2 3 4)
+	punpckhbw mm6,mm0		; mm6=( 5 6 7 8)
+
+	pmullw	mm1,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	mm2,[GOTOFF(ebx,PW_ONE)]
+	paddw	mm5,[GOTOFF(ebx,PW_ONE)]
+	paddw	mm3,[GOTOFF(ebx,PW_TWO)]
+	paddw	mm6,[GOTOFF(ebx,PW_TWO)]
+
+	paddw	mm2,mm1
+	paddw	mm5,mm4
+	psrlw	mm2,2			; mm2=OutLE=( 0  2  4  6)
+	psrlw	mm5,2			; mm5=OutHE=( 8 10 12 14)
+	paddw	mm3,mm1
+	paddw	mm6,mm4
+	psrlw	mm3,2			; mm3=OutLO=( 1  3  5  7)
+	psrlw	mm6,2			; mm6=OutHO=( 9 11 13 15)
+
+	psllw	mm3,BYTE_BIT
+	psllw	mm6,BYTE_BIT
+	por	mm2,mm3			; mm2=OutL=( 0  1  2  3  4  5  6  7)
+	por	mm5,mm6			; mm5=OutH=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm5
+
+	sub	eax, byte SIZEOF_MMWORD
+	add	esi, byte 1*SIZEOF_MMWORD	; inptr
+	add	edi, byte 2*SIZEOF_MMWORD	; outptr
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	near .columnloop
+	test	eax,eax
+	jnz	near .columnloop_last
+
+	pop	esi
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		4
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_h2v2_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v2_fancy_upsample_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	edx,eax				; edx = original ebp
+	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+	test	eax,eax
+	jz	near .return
+
+	mov	ecx, INT [max_v_samp(edx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
+	mov	edi, POINTER [output_data_ptr(edx)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax					; colctr
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	test	eax, SIZEOF_MMWORD-1
+	jz	short .skip
+	push	edx
+	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+	pop	edx
+.skip:
+	; -- process the first column block
+
+	movq	mm0, MMWORD [ebx+0*SIZEOF_MMWORD]	; mm0=row[ 0][0]
+	movq	mm1, MMWORD [ecx+0*SIZEOF_MMWORD]	; mm1=row[-1][0]
+	movq	mm2, MMWORD [esi+0*SIZEOF_MMWORD]	; mm2=row[+1][0]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      mm3,mm3		; mm3=(all 0's)
+	movq      mm4,mm0
+	punpcklbw mm0,mm3		; mm0=row[ 0][0]( 0 1 2 3)
+	punpckhbw mm4,mm3		; mm4=row[ 0][0]( 4 5 6 7)
+	movq      mm5,mm1
+	punpcklbw mm1,mm3		; mm1=row[-1][0]( 0 1 2 3)
+	punpckhbw mm5,mm3		; mm5=row[-1][0]( 4 5 6 7)
+	movq      mm6,mm2
+	punpcklbw mm2,mm3		; mm2=row[+1][0]( 0 1 2 3)
+	punpckhbw mm6,mm3		; mm6=row[+1][0]( 4 5 6 7)
+
+	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+
+	pcmpeqb	mm7,mm7
+	psrlq	mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
+
+	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
+	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
+	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
+	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
+
+	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1	; temporarily save
+	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5	; the intermediate data
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm6
+
+	pand	mm1,mm7			; mm1=( 0 - - -)
+	pand	mm2,mm7			; mm2=( 0 - - -)
+
+	movq	MMWORD [wk(0)], mm1
+	movq	MMWORD [wk(1)], mm2
+
+	poppic	ebx
+
+	add	eax, byte SIZEOF_MMWORD-1
+	and	eax, byte -SIZEOF_MMWORD
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	short .columnloop
+	alignx	16,7
+
+.columnloop_last:
+	; -- process the last column block
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pcmpeqb	mm1,mm1
+	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
+	movq	mm2,mm1
+
+	pand	mm1, MMWORD [edx+1*SIZEOF_MMWORD]	; mm1=( - - - 7)
+	pand	mm2, MMWORD [edi+1*SIZEOF_MMWORD]	; mm2=( - - - 7)
+
+	movq	MMWORD [wk(2)], mm1
+	movq	MMWORD [wk(3)], mm2
+
+	jmp	short .upsample
+	alignx	16,7
+
+.columnloop:
+	; -- process the next column block
+
+	movq	mm0, MMWORD [ebx+1*SIZEOF_MMWORD]	; mm0=row[ 0][1]
+	movq	mm1, MMWORD [ecx+1*SIZEOF_MMWORD]	; mm1=row[-1][1]
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]	; mm2=row[+1][1]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      mm3,mm3		; mm3=(all 0's)
+	movq      mm4,mm0
+	punpcklbw mm0,mm3		; mm0=row[ 0][1]( 0 1 2 3)
+	punpckhbw mm4,mm3		; mm4=row[ 0][1]( 4 5 6 7)
+	movq      mm5,mm1
+	punpcklbw mm1,mm3		; mm1=row[-1][1]( 0 1 2 3)
+	punpckhbw mm5,mm3		; mm5=row[-1][1]( 4 5 6 7)
+	movq      mm6,mm2
+	punpcklbw mm2,mm3		; mm2=row[+1][1]( 0 1 2 3)
+	punpckhbw mm6,mm3		; mm6=row[+1][1]( 4 5 6 7)
+
+	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+
+	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
+	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
+	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
+	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
+
+	movq	MMWORD [edx+2*SIZEOF_MMWORD], mm1	; temporarily save
+	movq	MMWORD [edx+3*SIZEOF_MMWORD], mm5	; the intermediate data
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm6
+
+	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm1=( - - - 0)
+	psllq	mm2,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm2=( - - - 0)
+
+	movq	MMWORD [wk(2)], mm1
+	movq	MMWORD [wk(3)], mm2
+
+.upsample:
+	; -- process the upper row
+
+	movq	mm7, MMWORD [edx+0*SIZEOF_MMWORD]	; mm7=Int0L=( 0 1 2 3)
+	movq	mm3, MMWORD [edx+1*SIZEOF_MMWORD]	; mm3=Int0H=( 4 5 6 7)
+
+	movq	mm0,mm7
+	movq	mm4,mm3
+	psrlq	mm0,2*BYTE_BIT			; mm0=( 1 2 3 -)
+	psllq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( - - - 4)
+	movq	mm5,mm7
+	movq	mm6,mm3
+	psrlq	mm5,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm5=( 3 - - -)
+	psllq	mm6,2*BYTE_BIT			; mm6=( - 4 5 6)
+
+	por	mm0,mm4				; mm0=( 1 2 3 4)
+	por	mm5,mm6				; mm5=( 3 4 5 6)
+
+	movq	mm1,mm7
+	movq	mm2,mm3
+	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
+	psrlq	mm2,2*BYTE_BIT			; mm2=( 5 6 7 -)
+	movq	mm4,mm3
+	psrlq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( 7 - - -)
+
+	por	mm1, MMWORD [wk(0)]		; mm1=(-1 0 1 2)
+	por	mm2, MMWORD [wk(2)]		; mm2=( 5 6 7 8)
+
+	movq	MMWORD [wk(0)], mm4
+
+	pmullw	mm7,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm3,[GOTOFF(ebx,PW_THREE)]
+	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm5,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm0,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	mm2,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	mm1,mm7
+	paddw	mm5,mm3
+	psrlw	mm1,4			; mm1=Out0LE=( 0  2  4  6)
+	psrlw	mm5,4			; mm5=Out0HE=( 8 10 12 14)
+	paddw	mm0,mm7
+	paddw	mm2,mm3
+	psrlw	mm0,4			; mm0=Out0LO=( 1  3  5  7)
+	psrlw	mm2,4			; mm2=Out0HO=( 9 11 13 15)
+
+	psllw	mm0,BYTE_BIT
+	psllw	mm2,BYTE_BIT
+	por	mm1,mm0			; mm1=Out0L=( 0  1  2  3  4  5  6  7)
+	por	mm5,mm2			; mm5=Out0H=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1
+	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5
+
+	; -- process the lower row
+
+	movq	mm6, MMWORD [edi+0*SIZEOF_MMWORD]	; mm6=Int1L=( 0 1 2 3)
+	movq	mm4, MMWORD [edi+1*SIZEOF_MMWORD]	; mm4=Int1H=( 4 5 6 7)
+
+	movq	mm7,mm6
+	movq	mm3,mm4
+	psrlq	mm7,2*BYTE_BIT			; mm7=( 1 2 3 -)
+	psllq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( - - - 4)
+	movq	mm0,mm6
+	movq	mm2,mm4
+	psrlq	mm0,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm0=( 3 - - -)
+	psllq	mm2,2*BYTE_BIT			; mm2=( - 4 5 6)
+
+	por	mm7,mm3				; mm7=( 1 2 3 4)
+	por	mm0,mm2				; mm0=( 3 4 5 6)
+
+	movq	mm1,mm6
+	movq	mm5,mm4
+	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
+	psrlq	mm5,2*BYTE_BIT			; mm5=( 5 6 7 -)
+	movq	mm3,mm4
+	psrlq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( 7 - - -)
+
+	por	mm1, MMWORD [wk(1)]		; mm1=(-1 0 1 2)
+	por	mm5, MMWORD [wk(3)]		; mm5=( 5 6 7 8)
+
+	movq	MMWORD [wk(1)], mm3
+
+	pmullw	mm6,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm0,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm7,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	mm5,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	mm1,mm6
+	paddw	mm0,mm4
+	psrlw	mm1,4			; mm1=Out1LE=( 0  2  4  6)
+	psrlw	mm0,4			; mm0=Out1HE=( 8 10 12 14)
+	paddw	mm7,mm6
+	paddw	mm5,mm4
+	psrlw	mm7,4			; mm7=Out1LO=( 1  3  5  7)
+	psrlw	mm5,4			; mm5=Out1HO=( 9 11 13 15)
+
+	psllw	mm7,BYTE_BIT
+	psllw	mm5,BYTE_BIT
+	por	mm1,mm7			; mm1=Out1L=( 0  1  2  3  4  5  6  7)
+	por	mm0,mm5			; mm0=Out1H=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm1
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm0
+
+	poppic	ebx
+
+	sub	eax, byte SIZEOF_MMWORD
+	add	ecx, byte 1*SIZEOF_MMWORD	; inptr1(above)
+	add	ebx, byte 1*SIZEOF_MMWORD	; inptr0
+	add	esi, byte 1*SIZEOF_MMWORD	; inptr1(below)
+	add	edx, byte 2*SIZEOF_MMWORD	; outptr0
+	add	edi, byte 2*SIZEOF_MMWORD	; outptr1
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	near .columnloop
+	test	eax,eax
+	jnz	near .columnloop_last
+
+	pop	esi
+	pop	edi
+	pop	ecx
+	pop	eax
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
+;                          JDIMENSION output_width,
+;                          JSAMPARRAY input_data,
+;                          JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define output_width(b)	(b)+12		; JDIMENSION output_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_upsample_mmx)
+
+EXTN(jsimd_h2v1_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, JDIMENSION [output_width(ebp)]
+	add	edx, byte (2*SIZEOF_MMWORD)-1
+	and	edx, byte -(2*SIZEOF_MMWORD)
+	jz	short .return
+
+	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
+	test	ecx,ecx
+	jz	short .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+	mov	eax,edx				; colctr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+	movq      mm1,mm0
+	punpcklbw mm0,mm0
+	punpckhbw mm1,mm1
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .nextrow
+
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+	movq      mm3,mm2
+	punpcklbw mm2,mm2
+	punpckhbw mm3,mm3
+
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr
+	add	edi, byte 4*SIZEOF_MMWORD	; outptr
+	jmp	short .columnloop
+	alignx	16,7
+
+.nextrow:
+	pop	esi
+	pop	edi
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	short .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
+;                          JDIMENSION output_width,
+;                          JSAMPARRAY input_data,
+;                          JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define output_width(b)	(b)+12		; JDIMENSION output_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v2_upsample_mmx)
+
+EXTN(jsimd_h2v2_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, JDIMENSION [output_width(ebp)]
+	add	edx, byte (2*SIZEOF_MMWORD)-1
+	and	edx, byte -(2*SIZEOF_MMWORD)
+	jz	near .return
+
+	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
+	test	ecx,ecx
+	jz	short .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]			; inptr
+	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+	mov	eax,edx					; colctr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+	movq      mm1,mm0
+	punpcklbw mm0,mm0
+	punpckhbw mm1,mm1
+
+	movq	MMWORD [ebx+0*SIZEOF_MMWORD], mm0
+	movq	MMWORD [ebx+1*SIZEOF_MMWORD], mm1
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .nextrow
+
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+	movq      mm3,mm2
+	punpcklbw mm2,mm2
+	punpckhbw mm3,mm3
+
+	movq	MMWORD [ebx+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [ebx+3*SIZEOF_MMWORD], mm3
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr
+	add	ebx, byte 4*SIZEOF_MMWORD	; outptr0
+	add	edi, byte 4*SIZEOF_MMWORD	; outptr1
+	jmp	short .columnloop
+	alignx	16,7
+
+.nextrow:
+	pop	esi
+	pop	edi
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	short .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
diff --git a/simd/jfmmxfst.asm b/simd/jfmmxfst.asm
new file mode 100644
index 0000000..46556b1
--- /dev/null
+++ b/simd/jfmmxfst.asm
@@ -0,0 +1,394 @@
+;
+; jfmmxfst.asm - fast integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "simd/jsimdext.inc"
+%include "simd/jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382	equ	 98		; FIX(0.382683433)
+F_0_541	equ	139		; FIX(0.541196100)
+F_0_707	equ	181		; FIX(0.707106781)
+F_1_306	equ	334		; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
+F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx):
+
+PW_F0707	times 4 dw  F_0_707 << CONST_SHIFT
+PW_F0382	times 4 dw  F_0_382 << CONST_SHIFT
+PW_F0541	times 4 dw  F_0_541 << CONST_SHIFT
+PW_F1306	times 4 dw  F_1_306 << CONST_SHIFT
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_mmx (DCTELEM * data)
+;
+
+%define data(b)		(b)+8		; DCTELEM * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_fdct_ifast_mmx)
+
+EXTN(jsimd_fdct_ifast_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.rowloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(20 21 22 23), mm2=(24 25 26 27)
+	; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(24 34 25 35)
+	punpckhwd mm5,mm3		; mm5=(26 36 27 37)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 01 02 03), mm1=(04 05 06 07)
+	; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 32 23 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(24 34 25 35)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm4,mm7		; mm4=(02 12 03 13)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(04 14 05 15)
+	punpckhwd mm2,mm3		; mm2=(06 16 07 17)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 10 20 30)=data0
+	punpckhdq mm7,mm0		; mm7=(01 11 21 31)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(06 16 26 36)=data6
+	punpckhdq mm3,mm5		; mm3=(07 17 27 37)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 32 23 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(24 34 25 35)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(02 12 22 32)=data2
+	punpckhdq mm7,mm2		; mm7=(03 13 23 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(04 14 24 34)=data4
+	punpckhdq mm6,mm3		; mm6=(05 15 25 35)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	psubw	mm5,mm7			; mm5=tmp13
+	psubw	mm0,mm4			; mm0=tmp12
+	paddw	mm1,mm7			; mm1=tmp10
+	paddw	mm6,mm4			; mm6=tmp11
+
+	paddw	mm0,mm5
+	psllw	mm0,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+	movq	mm7,mm1
+	movq	mm4,mm5
+	psubw	mm1,mm6			; mm1=data4
+	psubw	mm5,mm0			; mm5=data6
+	paddw	mm7,mm6			; mm7=data0
+	paddw	mm4,mm0			; mm4=data2
+
+	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+	; -- Odd part
+
+	movq	mm6, MMWORD [wk(0)]	; mm6=tmp6
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp7
+
+	paddw	mm2,mm3			; mm2=tmp10
+	paddw	mm3,mm6			; mm3=tmp11
+	paddw	mm6,mm0			; mm6=tmp12, mm0=tmp7
+
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm6,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	mm3,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+	movq	mm1,mm2			; mm1=tmp10
+	psubw	mm2,mm6
+	pmulhw	mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+	pmulhw	mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+	pmulhw	mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+	paddw	mm1,mm2			; mm1=z2
+	paddw	mm6,mm2			; mm6=z4
+
+	movq	mm5,mm0
+	psubw	mm0,mm3			; mm0=z13
+	paddw	mm5,mm3			; mm5=z11
+
+	movq	mm7,mm0
+	movq	mm4,mm5
+	psubw	mm0,mm1			; mm0=data3
+	psubw	mm5,mm6			; mm5=data7
+	paddw	mm7,mm1			; mm7=data5
+	paddw	mm4,mm6			; mm4=data1
+
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+	add	edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(02 12 22 32), mm2=(42 52 62 72)
+	; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(02 03 12 13)
+	punpckhwd mm4,mm1		; mm4=(22 23 32 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(42 43 52 53)
+	punpckhwd mm5,mm3		; mm5=(62 63 72 73)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 10 20 30), mm1=(40 50 60 70)
+	; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 23 32 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(42 43 52 53)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 01 10 11)
+	punpckhwd mm4,mm7		; mm4=(20 21 30 31)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(40 41 50 51)
+	punpckhwd mm2,mm3		; mm2=(60 61 70 71)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03)=data0
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(60 61 62 63)=data6
+	punpckhdq mm3,mm5		; mm3=(70 71 72 73)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 23 32 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(42 43 52 53)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(20 21 22 23)=data2
+	punpckhdq mm7,mm2		; mm7=(30 31 32 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(40 41 42 43)=data4
+	punpckhdq mm6,mm3		; mm6=(50 51 52 53)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	psubw	mm5,mm7			; mm5=tmp13
+	psubw	mm0,mm4			; mm0=tmp12
+	paddw	mm1,mm7			; mm1=tmp10
+	paddw	mm6,mm4			; mm6=tmp11
+
+	paddw	mm0,mm5
+	psllw	mm0,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+	movq	mm7,mm1
+	movq	mm4,mm5
+	psubw	mm1,mm6			; mm1=data4
+	psubw	mm5,mm0			; mm5=data6
+	paddw	mm7,mm6			; mm7=data0
+	paddw	mm4,mm0			; mm4=data2
+
+	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+	; -- Odd part
+
+	movq	mm6, MMWORD [wk(0)]	; mm6=tmp6
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp7
+
+	paddw	mm2,mm3			; mm2=tmp10
+	paddw	mm3,mm6			; mm3=tmp11
+	paddw	mm6,mm0			; mm6=tmp12, mm0=tmp7
+
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm6,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	mm3,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+	movq	mm1,mm2			; mm1=tmp10
+	psubw	mm2,mm6
+	pmulhw	mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+	pmulhw	mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+	pmulhw	mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+	paddw	mm1,mm2			; mm1=z2
+	paddw	mm6,mm2			; mm6=z4
+
+	movq	mm5,mm0
+	psubw	mm0,mm3			; mm0=z13
+	paddw	mm5,mm3			; mm5=z11
+
+	movq	mm7,mm0
+	movq	mm4,mm5
+	psubw	mm0,mm1			; mm0=data3
+	psubw	mm5,mm6			; mm5=data7
+	paddw	mm7,mm1			; mm7=data5
+	paddw	mm4,mm6			; mm4=data1
+
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+	add	edx, byte 4*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	near .columnloop
+
+	emms		; empty MMX state
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
diff --git a/simd/jfmmxint.asm b/simd/jfmmxint.asm
new file mode 100644
index 0000000..87935a9
--- /dev/null
+++ b/simd/jfmmxint.asm
@@ -0,0 +1,619 @@
+;
+; jfmmxint.asm - accurate integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "simd/jsimdext.inc"
+%include "simd/jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_islow_mmx)
+
+EXTN(jconst_fdct_islow_mmx):
+
+PW_F130_F054	times 2 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130	times 2 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117	times 2 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078	times 2 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089	times 2 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060	times 2 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256	times 2 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050	times 2 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1	times 2 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2	times 2 dd  1 << (DESCALE_P2-1)
+PW_DESCALE_P2X	times 4 dw  1 << (PASS1_BITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_mmx (DCTELEM * data)
+;
+
+%define data(b)		(b)+8		; DCTELEM * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_fdct_islow_mmx)
+
+EXTN(jsimd_fdct_islow_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.rowloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(20 21 22 23), mm2=(24 25 26 27)
+	; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(24 34 25 35)
+	punpckhwd mm5,mm3		; mm5=(26 36 27 37)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 01 02 03), mm1=(04 05 06 07)
+	; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 32 23 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(24 34 25 35)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm4,mm7		; mm4=(02 12 03 13)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(04 14 05 15)
+	punpckhwd mm2,mm3		; mm2=(06 16 07 17)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 10 20 30)=data0
+	punpckhdq mm7,mm0		; mm7=(01 11 21 31)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(06 16 26 36)=data6
+	punpckhdq mm3,mm5		; mm3=(07 17 27 37)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 32 23 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(24 34 25 35)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(02 12 22 32)=data2
+	punpckhdq mm7,mm2		; mm7=(03 13 23 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(04 14 24 34)=data4
+	punpckhdq mm6,mm3		; mm6=(05 15 25 35)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	paddw	mm5,mm7			; mm5=tmp10
+	paddw	mm0,mm4			; mm0=tmp11
+	psubw	mm1,mm7			; mm1=tmp13
+	psubw	mm6,mm4			; mm6=tmp12
+
+	movq	mm7,mm5
+	paddw	mm5,mm0			; mm5=tmp10+tmp11
+	psubw	mm7,mm0			; mm7=tmp10-tmp11
+
+	psllw	mm5,PASS1_BITS		; mm5=data0
+	psllw	mm7,PASS1_BITS		; mm7=data4
+
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movq      mm4,mm1		; mm1=tmp13
+	movq      mm0,mm1
+	punpcklwd mm4,mm6		; mm6=tmp12
+	punpckhwd mm0,mm6
+	movq      mm1,mm4
+	movq      mm6,mm0
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=data2L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]	; mm0=data2H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=data6L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]	; mm6=data6H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm4,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm1,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm4,mm0		; mm4=data2
+	packssdw  mm1,mm6		; mm1=data6
+
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
+
+	; -- Odd part
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp6
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp7
+
+	movq	mm0,mm2			; mm2=tmp4
+	movq	mm6,mm3			; mm3=tmp5
+	paddw	mm0,mm5			; mm0=z3
+	paddw	mm6,mm7			; mm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm4,mm0
+	movq      mm1,mm0
+	punpcklwd mm4,mm6
+	punpckhwd mm1,mm6
+	movq      mm0,mm4
+	movq      mm6,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]	; mm4=z3L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]	; mm1=z3H
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]	; mm0=z4L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]	; mm6=z4H
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=z3L
+	movq	MMWORD [wk(1)], mm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movq      mm4,mm2
+	movq      mm1,mm2
+	punpcklwd mm4,mm7
+	punpckhwd mm1,mm7
+	movq      mm2,mm4
+	movq      mm7,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]	; mm4=tmp4L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]	; mm1=tmp4H
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]	; mm2=tmp7L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]	; mm7=tmp7H
+
+	paddd	mm4, MMWORD [wk(0)]	; mm4=data7L
+	paddd	mm1, MMWORD [wk(1)]	; mm1=data7H
+	paddd	mm2,mm0			; mm2=data1L
+	paddd	mm7,mm6			; mm7=data1H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm4,DESCALE_P1
+	psrad	mm1,DESCALE_P1
+	paddd	mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm2,DESCALE_P1
+	psrad	mm7,DESCALE_P1
+
+	packssdw  mm4,mm1		; mm4=data7
+	packssdw  mm2,mm7		; mm2=data1
+
+	movq	MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+	movq      mm1,mm3
+	movq      mm7,mm3
+	punpcklwd mm1,mm5
+	punpckhwd mm7,mm5
+	movq      mm3,mm1
+	movq      mm5,mm7
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]	; mm1=tmp5L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]	; mm7=tmp5H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]	; mm3=tmp6L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]	; mm5=tmp6H
+
+	paddd	mm1,mm0			; mm1=data5L
+	paddd	mm7,mm6			; mm7=data5H
+	paddd	mm3, MMWORD [wk(0)]	; mm3=data3L
+	paddd	mm5, MMWORD [wk(1)]	; mm5=data3H
+
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm1,DESCALE_P1
+	psrad	mm7,DESCALE_P1
+	paddd	mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm3,DESCALE_P1
+	psrad	mm5,DESCALE_P1
+
+	packssdw  mm1,mm7		; mm1=data5
+	packssdw  mm3,mm5		; mm3=data3
+
+	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+	add	edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(02 12 22 32), mm2=(42 52 62 72)
+	; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(02 03 12 13)
+	punpckhwd mm4,mm1		; mm4=(22 23 32 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(42 43 52 53)
+	punpckhwd mm5,mm3		; mm5=(62 63 72 73)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 10 20 30), mm1=(40 50 60 70)
+	; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 23 32 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(42 43 52 53)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 01 10 11)
+	punpckhwd mm4,mm7		; mm4=(20 21 30 31)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(40 41 50 51)
+	punpckhwd mm2,mm3		; mm2=(60 61 70 71)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03)=data0
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(60 61 62 63)=data6
+	punpckhdq mm3,mm5		; mm3=(70 71 72 73)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 23 32 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(42 43 52 53)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(20 21 22 23)=data2
+	punpckhdq mm7,mm2		; mm7=(30 31 32 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(40 41 42 43)=data4
+	punpckhdq mm6,mm3		; mm6=(50 51 52 53)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	paddw	mm5,mm7			; mm5=tmp10
+	paddw	mm0,mm4			; mm0=tmp11
+	psubw	mm1,mm7			; mm1=tmp13
+	psubw	mm6,mm4			; mm6=tmp12
+
+	movq	mm7,mm5
+	paddw	mm5,mm0			; mm5=tmp10+tmp11
+	psubw	mm7,mm0			; mm7=tmp10-tmp11
+
+	paddw	mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	paddw	mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	psraw	mm5,PASS1_BITS		; mm5=data0
+	psraw	mm7,PASS1_BITS		; mm7=data4
+
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movq      mm4,mm1		; mm1=tmp13
+	movq      mm0,mm1
+	punpcklwd mm4,mm6		; mm6=tmp12
+	punpckhwd mm0,mm6
+	movq      mm1,mm4
+	movq      mm6,mm0
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=data2L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]	; mm0=data2H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=data6L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]	; mm6=data6H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm4,DESCALE_P2
+	psrad	mm0,DESCALE_P2
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm1,DESCALE_P2
+	psrad	mm6,DESCALE_P2
+
+	packssdw  mm4,mm0		; mm4=data2
+	packssdw  mm1,mm6		; mm1=data6
+
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
+
+	; -- Odd part
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp6
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp7
+
+	movq	mm0,mm2			; mm2=tmp4
+	movq	mm6,mm3			; mm3=tmp5
+	paddw	mm0,mm5			; mm0=z3
+	paddw	mm6,mm7			; mm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm4,mm0
+	movq      mm1,mm0
+	punpcklwd mm4,mm6
+	punpckhwd mm1,mm6
+	movq      mm0,mm4
+	movq      mm6,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]	; mm4=z3L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]	; mm1=z3H
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]	; mm0=z4L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]	; mm6=z4H
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=z3L
+	movq	MMWORD [wk(1)], mm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movq      mm4,mm2
+	movq      mm1,mm2
+	punpcklwd mm4,mm7
+	punpckhwd mm1,mm7
+	movq      mm2,mm4
+	movq      mm7,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]	; mm4=tmp4L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]	; mm1=tmp4H
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]	; mm2=tmp7L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]	; mm7=tmp7H
+
+	paddd	mm4, MMWORD [wk(0)]	; mm4=data7L
+	paddd	mm1, MMWORD [wk(1)]	; mm1=data7H
+	paddd	mm2,mm0			; mm2=data1L
+	paddd	mm7,mm6			; mm7=data1H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm4,DESCALE_P2
+	psrad	mm1,DESCALE_P2
+	paddd	mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm2,DESCALE_P2
+	psrad	mm7,DESCALE_P2
+
+	packssdw  mm4,mm1		; mm4=data7
+	packssdw  mm2,mm7		; mm2=data1
+
+	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+	movq      mm1,mm3
+	movq      mm7,mm3
+	punpcklwd mm1,mm5
+	punpckhwd mm7,mm5
+	movq      mm3,mm1
+	movq      mm5,mm7
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]	; mm1=tmp5L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]	; mm7=tmp5H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]	; mm3=tmp6L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]	; mm5=tmp6H
+
+	paddd	mm1,mm0			; mm1=data5L
+	paddd	mm7,mm6			; mm7=data5H
+	paddd	mm3, MMWORD [wk(0)]	; mm3=data3L
+	paddd	mm5, MMWORD [wk(1)]	; mm5=data3H
+
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm1,DESCALE_P2
+	psrad	mm7,DESCALE_P2
+	paddd	mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm3,DESCALE_P2
+	psrad	mm5,DESCALE_P2
+
+	packssdw  mm1,mm7		; mm1=data5
+	packssdw  mm3,mm5		; mm3=data3
+
+	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+	add	edx, byte 4*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	near .columnloop
+
+	emms		; empty MMX state
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
diff --git a/simd/jimmxfst.asm b/simd/jimmxfst.asm
new file mode 100644
index 0000000..662a522
--- /dev/null
+++ b/simd/jimmxfst.asm
@@ -0,0 +1,497 @@
+;
+; jimmxfst.asm - fast integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "simd/jsimdext.inc"
+%include "simd/jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+%define PASS1_BITS	2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082	equ	277		; FIX(1.082392200)
+F_1_414	equ	362		; FIX(1.414213562)
+F_1_847	equ	473		; FIX(1.847759065)
+F_2_613	equ	669		; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
+F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_idct_ifast_mmx)
+
+EXTN(jconst_idct_ifast_mmx):
+
+PW_F1414	times 4 dw  F_1_414 << CONST_SHIFT
+PW_F1847	times 4 dw  F_1_847 << CONST_SHIFT
+PW_MF1613	times 4 dw -F_1_613 << CONST_SHIFT
+PW_F1082	times 4 dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_mmx (void * dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; jpeg_component_info * compptr
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_ifast_mmx)
+
+EXTN(jsimd_idct_ifast_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm1,mm0
+	packsswb mm1,mm1
+	movd	eax,mm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	psubw	mm0,mm2			; mm0=tmp11
+	psubw	mm1,mm3
+	paddw	mm4,mm2			; mm4=tmp10
+	paddw	mm5,mm3			; mm5=tmp13
+
+	psllw	mm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm1,[GOTOFF(ebx,PW_F1414)]
+	psubw	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	psubw	mm4,mm5			; mm4=tmp3
+	psubw	mm0,mm1			; mm0=tmp2
+	paddw	mm6,mm5			; mm6=tmp0
+	paddw	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; wk(1)=tmp3
+	movq	MMWORD [wk(0)], mm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	psubw	mm2,mm1			; mm2=z12
+	psubw	mm5,mm3			; mm5=z10
+	paddw	mm4,mm1			; mm4=z11
+	paddw	mm0,mm3			; mm0=z13
+
+	movq	mm1,mm5			; mm1=z10(unscaled)
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm5,PRE_MULTIPLY_SCALE_BITS
+
+	movq	mm3,mm4
+	psubw	mm4,mm0
+	paddw	mm3,mm0			; mm3=tmp7
+
+	psllw	mm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm4,[GOTOFF(ebx,PW_F1414)]	; mm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movq	mm0,mm5
+	paddw	mm5,mm2
+	pmulhw	mm5,[GOTOFF(ebx,PW_F1847)]	; mm5=z5
+	pmulhw	mm0,[GOTOFF(ebx,PW_MF1613)]
+	pmulhw	mm2,[GOTOFF(ebx,PW_F1082)]
+	psubw	mm0,mm1
+	psubw	mm2,mm5			; mm2=tmp10
+	paddw	mm0,mm5			; mm0=tmp12
+
+	; -- Final output stage
+
+	psubw	mm0,mm3			; mm0=tmp6
+	movq	mm1,mm6
+	movq	mm5,mm7
+	paddw	mm6,mm3			; mm6=data0=(00 01 02 03)
+	paddw	mm7,mm0			; mm7=data1=(10 11 12 13)
+	psubw	mm1,mm3			; mm1=data7=(70 71 72 73)
+	psubw	mm5,mm0			; mm5=data6=(60 61 62 63)
+	psubw	mm4,mm0			; mm4=tmp5
+
+	movq      mm3,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm3,mm7		; mm3=(02 12 03 13)
+	movq      mm0,mm5		; transpose coefficients(phase 1)
+	punpcklwd mm5,mm1		; mm5=(60 70 61 71)
+	punpckhwd mm0,mm1		; mm0=(62 72 63 73)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp2
+	movq	mm1, MMWORD [wk(1)]	; mm1=tmp3
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(60 70 61 71)
+	movq	MMWORD [wk(1)], mm0	; wk(1)=(62 72 63 73)
+
+	paddw	mm2,mm4			; mm2=tmp4
+	movq	mm5,mm7
+	movq	mm0,mm1
+	paddw	mm7,mm4			; mm7=data2=(20 21 22 23)
+	paddw	mm1,mm2			; mm1=data4=(40 41 42 43)
+	psubw	mm5,mm4			; mm5=data5=(50 51 52 53)
+	psubw	mm0,mm2			; mm0=data3=(30 31 32 33)
+
+	movq      mm4,mm7		; transpose coefficients(phase 1)
+	punpcklwd mm7,mm0		; mm7=(20 30 21 31)
+	punpckhwd mm4,mm0		; mm4=(22 32 23 33)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm5		; mm1=(40 50 41 51)
+	punpckhwd mm2,mm5		; mm2=(42 52 43 53)
+
+	movq      mm0,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm7		; mm6=(00 10 20 30)
+	punpckhdq mm0,mm7		; mm0=(01 11 21 31)
+	movq      mm5,mm3		; transpose coefficients(phase 2)
+	punpckldq mm3,mm4		; mm3=(02 12 22 32)
+	punpckhdq mm5,mm4		; mm5=(03 13 23 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=(60 70 61 71)
+	movq	mm4, MMWORD [wk(1)]	; mm4=(62 72 63 73)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm7		; mm1=(40 50 60 70)
+	punpckhdq mm6,mm7		; mm6=(41 51 61 71)
+	movq      mm0,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm4		; mm2=(42 52 62 72)
+	punpckhdq mm0,mm4		; mm0=(43 53 63 73)
+
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_IFAST_MULT_TYPE	; quantptr
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	psubw	mm0,mm2			; mm0=tmp11
+	psubw	mm1,mm3
+	paddw	mm4,mm2			; mm4=tmp10
+	paddw	mm5,mm3			; mm5=tmp13
+
+	psllw	mm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm1,[GOTOFF(ebx,PW_F1414)]
+	psubw	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	psubw	mm4,mm5			; mm4=tmp3
+	psubw	mm0,mm1			; mm0=tmp2
+	paddw	mm6,mm5			; mm6=tmp0
+	paddw	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; wk(1)=tmp3
+	movq	MMWORD [wk(0)], mm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	psubw	mm2,mm1			; mm2=z12
+	psubw	mm5,mm3			; mm5=z10
+	paddw	mm4,mm1			; mm4=z11
+	paddw	mm0,mm3			; mm0=z13
+
+	movq	mm1,mm5			; mm1=z10(unscaled)
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm5,PRE_MULTIPLY_SCALE_BITS
+
+	movq	mm3,mm4
+	psubw	mm4,mm0
+	paddw	mm3,mm0			; mm3=tmp7
+
+	psllw	mm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm4,[GOTOFF(ebx,PW_F1414)]	; mm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movq	mm0,mm5
+	paddw	mm5,mm2
+	pmulhw	mm5,[GOTOFF(ebx,PW_F1847)]	; mm5=z5
+	pmulhw	mm0,[GOTOFF(ebx,PW_MF1613)]
+	pmulhw	mm2,[GOTOFF(ebx,PW_F1082)]
+	psubw	mm0,mm1
+	psubw	mm2,mm5			; mm2=tmp10
+	paddw	mm0,mm5			; mm0=tmp12
+
+	; -- Final output stage
+
+	psubw	mm0,mm3			; mm0=tmp6
+	movq	mm1,mm6
+	movq	mm5,mm7
+	paddw	mm6,mm3			; mm6=data0=(00 10 20 30)
+	paddw	mm7,mm0			; mm7=data1=(01 11 21 31)
+	psraw	mm6,(PASS1_BITS+3)	; descale
+	psraw	mm7,(PASS1_BITS+3)	; descale
+	psubw	mm1,mm3			; mm1=data7=(07 17 27 37)
+	psubw	mm5,mm0			; mm5=data6=(06 16 26 36)
+	psraw	mm1,(PASS1_BITS+3)	; descale
+	psraw	mm5,(PASS1_BITS+3)	; descale
+	psubw	mm4,mm0			; mm4=tmp5
+
+	packsswb  mm6,mm5		; mm6=(00 10 20 30 06 16 26 36)
+	packsswb  mm7,mm1		; mm7=(01 11 21 31 07 17 27 37)
+
+	movq	mm3, MMWORD [wk(0)]	; mm3=tmp2
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp3
+
+	paddw	mm2,mm4			; mm2=tmp4
+	movq	mm5,mm3
+	movq	mm1,mm0
+	paddw	mm3,mm4			; mm3=data2=(02 12 22 32)
+	paddw	mm0,mm2			; mm0=data4=(04 14 24 34)
+	psraw	mm3,(PASS1_BITS+3)	; descale
+	psraw	mm0,(PASS1_BITS+3)	; descale
+	psubw	mm5,mm4			; mm5=data5=(05 15 25 35)
+	psubw	mm1,mm2			; mm1=data3=(03 13 23 33)
+	psraw	mm5,(PASS1_BITS+3)	; descale
+	psraw	mm1,(PASS1_BITS+3)	; descale
+
+	movq      mm4,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm4=[PB_CENTERJSAMP]
+
+	packsswb  mm3,mm0		; mm3=(02 12 22 32 04 14 24 34)
+	packsswb  mm1,mm5		; mm1=(03 13 23 33 05 15 25 35)
+
+	paddb     mm6,mm4
+	paddb     mm7,mm4
+	paddb     mm3,mm4
+	paddb     mm1,mm4
+
+	movq      mm2,mm6		; transpose coefficients(phase 1)
+	punpcklbw mm6,mm7		; mm6=(00 01 10 11 20 21 30 31)
+	punpckhbw mm2,mm7		; mm2=(06 07 16 17 26 27 36 37)
+	movq      mm0,mm3		; transpose coefficients(phase 1)
+	punpcklbw mm3,mm1		; mm3=(02 03 12 13 22 23 32 33)
+	punpckhbw mm0,mm1		; mm0=(04 05 14 15 24 25 34 35)
+
+	movq      mm5,mm6		; transpose coefficients(phase 2)
+	punpcklwd mm6,mm3		; mm6=(00 01 02 03 10 11 12 13)
+	punpckhwd mm5,mm3		; mm5=(20 21 22 23 30 31 32 33)
+	movq      mm4,mm0		; transpose coefficients(phase 2)
+	punpcklwd mm0,mm2		; mm0=(04 05 06 07 14 15 16 17)
+	punpckhwd mm4,mm2		; mm4=(24 25 26 27 34 35 36 37)
+
+	movq      mm7,mm6		; transpose coefficients(phase 3)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03 04 05 06 07)
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13 14 15 16 17)
+	movq      mm1,mm5		; transpose coefficients(phase 3)
+	punpckldq mm5,mm4		; mm5=(20 21 22 23 24 25 26 27)
+	punpckhdq mm1,mm4		; mm1=(30 31 32 33 34 35 36 37)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_JCOEF	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
diff --git a/simd/jimmxint.asm b/simd/jimmxint.asm
new file mode 100644
index 0000000..775b1e8
--- /dev/null
+++ b/simd/jimmxint.asm
@@ -0,0 +1,849 @@
+;
+; jimmxint.asm - accurate integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "simd/jsimdext.inc"
+%include "simd/jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_islow_mmx)
+
+EXTN(jconst_idct_islow_mmx):
+
+PW_F130_F054	times 2 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130	times 2 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117	times 2 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078	times 2 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089	times 2 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060	times 2 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256	times 2 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050	times 2 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1	times 2 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2	times 2 dd  1 << (DESCALE_P2-1)
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_mmx (void * dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; jpeg_component_info * compptr
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		12
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_islow_mmx)
+
+EXTN(jsimd_idct_islow_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm1,mm0
+	packsswb mm1,mm1
+	movd	eax,mm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	mm0,PASS1_BITS
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movq      mm4,mm1		; mm1=in2=z2
+	movq      mm5,mm1
+	punpcklwd mm4,mm3		; mm3=in6=z3
+	punpckhwd mm5,mm3
+	movq      mm1,mm4
+	movq      mm3,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=tmp3L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]	; mm5=tmp3H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]	; mm3=tmp2H
+
+	movq      mm6,mm0
+	paddw     mm0,mm2		; mm0=in0+in4
+	psubw     mm6,mm2		; mm6=in0-in4
+
+	pxor      mm7,mm7
+	pxor      mm2,mm2
+	punpcklwd mm7,mm0		; mm7=tmp0L
+	punpckhwd mm2,mm0		; mm2=tmp0H
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+	psrad     mm2,(16-CONST_BITS)	; psrad mm2,16 & pslld mm2,CONST_BITS
+
+	movq	mm0,mm7
+	paddd	mm7,mm4			; mm7=tmp10L
+	psubd	mm0,mm4			; mm0=tmp13L
+	movq	mm4,mm2
+	paddd	mm2,mm5			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp13H
+
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp10L
+	movq	MMWORD [wk(1)], mm2	; wk(1)=tmp10H
+	movq	MMWORD [wk(2)], mm0	; wk(2)=tmp13L
+	movq	MMWORD [wk(3)], mm4	; wk(3)=tmp13H
+
+	pxor      mm5,mm5
+	pxor      mm7,mm7
+	punpcklwd mm5,mm6		; mm5=tmp1L
+	punpckhwd mm7,mm6		; mm7=tmp1H
+	psrad     mm5,(16-CONST_BITS)	; psrad mm5,16 & pslld mm5,CONST_BITS
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+
+	movq	mm2,mm5
+	paddd	mm5,mm1			; mm5=tmp11L
+	psubd	mm2,mm1			; mm2=tmp12L
+	movq	mm0,mm7
+	paddd	mm7,mm3			; mm7=tmp11H
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	movq	MMWORD [wk(4)], mm5	; wk(4)=tmp11L
+	movq	MMWORD [wk(5)], mm7	; wk(5)=tmp11H
+	movq	MMWORD [wk(6)], mm2	; wk(6)=tmp12L
+	movq	MMWORD [wk(7)], mm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movq	mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movq	mm5,mm6
+	movq	mm7,mm4
+	paddw	mm5,mm3			; mm5=z3
+	paddw	mm7,mm1			; mm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm2,mm5
+	movq      mm0,mm5
+	punpcklwd mm2,mm7
+	punpckhwd mm0,mm7
+	movq      mm5,mm2
+	movq      mm7,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]	; mm2=z3L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]	; mm0=z3H
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]	; mm5=z4L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]	; mm7=z4H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=z3L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movq      mm2,mm3
+	movq      mm0,mm3
+	punpcklwd mm2,mm4
+	punpckhwd mm0,mm4
+	movq      mm3,mm2
+	movq      mm4,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]	; mm2=tmp0L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]	; mm0=tmp0H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]	; mm3=tmp3L
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]	; mm4=tmp3H
+
+	paddd	mm2, MMWORD [wk(10)]	; mm2=tmp0L
+	paddd	mm0, MMWORD [wk(11)]	; mm0=tmp0H
+	paddd	mm3,mm5			; mm3=tmp3L
+	paddd	mm4,mm7			; mm4=tmp3H
+
+	movq	MMWORD [wk(8)], mm2	; wk(8)=tmp0L
+	movq	MMWORD [wk(9)], mm0	; wk(9)=tmp0H
+
+	movq      mm2,mm1
+	movq      mm0,mm1
+	punpcklwd mm2,mm6
+	punpckhwd mm0,mm6
+	movq      mm1,mm2
+	movq      mm6,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]	; mm2=tmp1L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]	; mm0=tmp1H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]	; mm1=tmp2L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]	; mm6=tmp2H
+
+	paddd	mm2,mm5			; mm2=tmp1L
+	paddd	mm0,mm7			; mm0=tmp1H
+	paddd	mm1, MMWORD [wk(10)]	; mm1=tmp2L
+	paddd	mm6, MMWORD [wk(11)]	; mm6=tmp2H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=tmp1L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp10L
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp10H
+
+	movq	mm2,mm5
+	movq	mm0,mm7
+	paddd	mm5,mm3			; mm5=data0L
+	paddd	mm7,mm4			; mm7=data0H
+	psubd	mm2,mm3			; mm2=data7L
+	psubd	mm0,mm4			; mm0=data7H
+
+	movq	mm3,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm3=[PD_DESCALE_P1]
+
+	paddd	mm5,mm3
+	paddd	mm7,mm3
+	psrad	mm5,DESCALE_P1
+	psrad	mm7,DESCALE_P1
+	paddd	mm2,mm3
+	paddd	mm0,mm3
+	psrad	mm2,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+
+	packssdw  mm5,mm7		; mm5=data0=(00 01 02 03)
+	packssdw  mm2,mm0		; mm2=data7=(70 71 72 73)
+
+	movq	mm4, MMWORD [wk(4)]	; mm4=tmp11L
+	movq	mm3, MMWORD [wk(5)]	; mm3=tmp11H
+
+	movq	mm7,mm4
+	movq	mm0,mm3
+	paddd	mm4,mm1			; mm4=data1L
+	paddd	mm3,mm6			; mm3=data1H
+	psubd	mm7,mm1			; mm7=data6L
+	psubd	mm0,mm6			; mm0=data6H
+
+	movq	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm1=[PD_DESCALE_P1]
+
+	paddd	mm4,mm1
+	paddd	mm3,mm1
+	psrad	mm4,DESCALE_P1
+	psrad	mm3,DESCALE_P1
+	paddd	mm7,mm1
+	paddd	mm0,mm1
+	psrad	mm7,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+
+	packssdw  mm4,mm3		; mm4=data1=(10 11 12 13)
+	packssdw  mm7,mm0		; mm7=data6=(60 61 62 63)
+
+	movq      mm6,mm5		; transpose coefficients(phase 1)
+	punpcklwd mm5,mm4		; mm5=(00 10 01 11)
+	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
+	movq      mm1,mm7		; transpose coefficients(phase 1)
+	punpcklwd mm7,mm2		; mm7=(60 70 61 71)
+	punpckhwd mm1,mm2		; mm1=(62 72 63 73)
+
+	movq	mm3, MMWORD [wk(6)]	; mm3=tmp12L
+	movq	mm0, MMWORD [wk(7)]	; mm0=tmp12H
+	movq	mm4, MMWORD [wk(10)]	; mm4=tmp1L
+	movq	mm2, MMWORD [wk(11)]	; mm2=tmp1H
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(00 10 01 11)
+	movq	MMWORD [wk(1)], mm6	; wk(1)=(02 12 03 13)
+	movq	MMWORD [wk(4)], mm7	; wk(4)=(60 70 61 71)
+	movq	MMWORD [wk(5)], mm1	; wk(5)=(62 72 63 73)
+
+	movq	mm5,mm3
+	movq	mm6,mm0
+	paddd	mm3,mm4			; mm3=data2L
+	paddd	mm0,mm2			; mm0=data2H
+	psubd	mm5,mm4			; mm5=data5L
+	psubd	mm6,mm2			; mm6=data5H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm7=[PD_DESCALE_P1]
+
+	paddd	mm3,mm7
+	paddd	mm0,mm7
+	psrad	mm3,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+	paddd	mm5,mm7
+	paddd	mm6,mm7
+	psrad	mm5,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm3,mm0		; mm3=data2=(20 21 22 23)
+	packssdw  mm5,mm6		; mm5=data5=(50 51 52 53)
+
+	movq	mm1, MMWORD [wk(2)]	; mm1=tmp13L
+	movq	mm4, MMWORD [wk(3)]	; mm4=tmp13H
+	movq	mm2, MMWORD [wk(8)]	; mm2=tmp0L
+	movq	mm7, MMWORD [wk(9)]	; mm7=tmp0H
+
+	movq	mm0,mm1
+	movq	mm6,mm4
+	paddd	mm1,mm2			; mm1=data3L
+	paddd	mm4,mm7			; mm4=data3H
+	psubd	mm0,mm2			; mm0=data4L
+	psubd	mm6,mm7			; mm6=data4H
+
+	movq	mm2,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm2=[PD_DESCALE_P1]
+
+	paddd	mm1,mm2
+	paddd	mm4,mm2
+	psrad	mm1,DESCALE_P1
+	psrad	mm4,DESCALE_P1
+	paddd	mm0,mm2
+	paddd	mm6,mm2
+	psrad	mm0,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm1,mm4		; mm1=data3=(30 31 32 33)
+	packssdw  mm0,mm6		; mm0=data4=(40 41 42 43)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=(00 10 01 11)
+	movq	mm2, MMWORD [wk(1)]	; mm2=(02 12 03 13)
+
+	movq      mm4,mm3		; transpose coefficients(phase 1)
+	punpcklwd mm3,mm1		; mm3=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm6,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm5		; mm0=(40 50 41 51)
+	punpckhwd mm6,mm5		; mm6=(42 52 43 53)
+
+	movq      mm1,mm7		; transpose coefficients(phase 2)
+	punpckldq mm7,mm3		; mm7=(00 10 20 30)
+	punpckhdq mm1,mm3		; mm1=(01 11 21 31)
+	movq      mm5,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm4		; mm2=(02 12 22 32)
+	punpckhdq mm5,mm4		; mm5=(03 13 23 33)
+
+	movq	mm3, MMWORD [wk(4)]	; mm3=(60 70 61 71)
+	movq	mm4, MMWORD [wk(5)]	; mm4=(62 72 63 73)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+	movq      mm7,mm0		; transpose coefficients(phase 2)
+	punpckldq mm0,mm3		; mm0=(40 50 60 70)
+	punpckhdq mm7,mm3		; mm7=(41 51 61 71)
+	movq      mm1,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm4		; mm6=(42 52 62 72)
+	punpckhdq mm1,mm4		; mm1=(43 53 63 73)
+
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movq      mm4,mm1		; mm1=in2=z2
+	movq      mm5,mm1
+	punpcklwd mm4,mm3		; mm3=in6=z3
+	punpckhwd mm5,mm3
+	movq      mm1,mm4
+	movq      mm3,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=tmp3L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]	; mm5=tmp3H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]	; mm3=tmp2H
+
+	movq      mm6,mm0
+	paddw     mm0,mm2		; mm0=in0+in4
+	psubw     mm6,mm2		; mm6=in0-in4
+
+	pxor      mm7,mm7
+	pxor      mm2,mm2
+	punpcklwd mm7,mm0		; mm7=tmp0L
+	punpckhwd mm2,mm0		; mm2=tmp0H
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+	psrad     mm2,(16-CONST_BITS)	; psrad mm2,16 & pslld mm2,CONST_BITS
+
+	movq	mm0,mm7
+	paddd	mm7,mm4			; mm7=tmp10L
+	psubd	mm0,mm4			; mm0=tmp13L
+	movq	mm4,mm2
+	paddd	mm2,mm5			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp13H
+
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp10L
+	movq	MMWORD [wk(1)], mm2	; wk(1)=tmp10H
+	movq	MMWORD [wk(2)], mm0	; wk(2)=tmp13L
+	movq	MMWORD [wk(3)], mm4	; wk(3)=tmp13H
+
+	pxor      mm5,mm5
+	pxor      mm7,mm7
+	punpcklwd mm5,mm6		; mm5=tmp1L
+	punpckhwd mm7,mm6		; mm7=tmp1H
+	psrad     mm5,(16-CONST_BITS)	; psrad mm5,16 & pslld mm5,CONST_BITS
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+
+	movq	mm2,mm5
+	paddd	mm5,mm1			; mm5=tmp11L
+	psubd	mm2,mm1			; mm2=tmp12L
+	movq	mm0,mm7
+	paddd	mm7,mm3			; mm7=tmp11H
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	movq	MMWORD [wk(4)], mm5	; wk(4)=tmp11L
+	movq	MMWORD [wk(5)], mm7	; wk(5)=tmp11H
+	movq	MMWORD [wk(6)], mm2	; wk(6)=tmp12L
+	movq	MMWORD [wk(7)], mm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movq	mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	movq	mm5,mm6
+	movq	mm7,mm4
+	paddw	mm5,mm3			; mm5=z3
+	paddw	mm7,mm1			; mm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm2,mm5
+	movq      mm0,mm5
+	punpcklwd mm2,mm7
+	punpckhwd mm0,mm7
+	movq      mm5,mm2
+	movq      mm7,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]	; mm2=z3L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]	; mm0=z3H
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]	; mm5=z4L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]	; mm7=z4H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=z3L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movq      mm2,mm3
+	movq      mm0,mm3
+	punpcklwd mm2,mm4
+	punpckhwd mm0,mm4
+	movq      mm3,mm2
+	movq      mm4,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]	; mm2=tmp0L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]	; mm0=tmp0H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]	; mm3=tmp3L
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]	; mm4=tmp3H
+
+	paddd	mm2, MMWORD [wk(10)]	; mm2=tmp0L
+	paddd	mm0, MMWORD [wk(11)]	; mm0=tmp0H
+	paddd	mm3,mm5			; mm3=tmp3L
+	paddd	mm4,mm7			; mm4=tmp3H
+
+	movq	MMWORD [wk(8)], mm2	; wk(8)=tmp0L
+	movq	MMWORD [wk(9)], mm0	; wk(9)=tmp0H
+
+	movq      mm2,mm1
+	movq      mm0,mm1
+	punpcklwd mm2,mm6
+	punpckhwd mm0,mm6
+	movq      mm1,mm2
+	movq      mm6,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]	; mm2=tmp1L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]	; mm0=tmp1H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]	; mm1=tmp2L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]	; mm6=tmp2H
+
+	paddd	mm2,mm5			; mm2=tmp1L
+	paddd	mm0,mm7			; mm0=tmp1H
+	paddd	mm1, MMWORD [wk(10)]	; mm1=tmp2L
+	paddd	mm6, MMWORD [wk(11)]	; mm6=tmp2H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=tmp1L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp10L
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp10H
+
+	movq	mm2,mm5
+	movq	mm0,mm7
+	paddd	mm5,mm3			; mm5=data0L
+	paddd	mm7,mm4			; mm7=data0H
+	psubd	mm2,mm3			; mm2=data7L
+	psubd	mm0,mm4			; mm0=data7H
+
+	movq	mm3,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm3=[PD_DESCALE_P2]
+
+	paddd	mm5,mm3
+	paddd	mm7,mm3
+	psrad	mm5,DESCALE_P2
+	psrad	mm7,DESCALE_P2
+	paddd	mm2,mm3
+	paddd	mm0,mm3
+	psrad	mm2,DESCALE_P2
+	psrad	mm0,DESCALE_P2
+
+	packssdw  mm5,mm7		; mm5=data0=(00 10 20 30)
+	packssdw  mm2,mm0		; mm2=data7=(07 17 27 37)
+
+	movq	mm4, MMWORD [wk(4)]	; mm4=tmp11L
+	movq	mm3, MMWORD [wk(5)]	; mm3=tmp11H
+
+	movq	mm7,mm4
+	movq	mm0,mm3
+	paddd	mm4,mm1			; mm4=data1L
+	paddd	mm3,mm6			; mm3=data1H
+	psubd	mm7,mm1			; mm7=data6L
+	psubd	mm0,mm6			; mm0=data6H
+
+	movq	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm1=[PD_DESCALE_P2]
+
+	paddd	mm4,mm1
+	paddd	mm3,mm1
+	psrad	mm4,DESCALE_P2
+	psrad	mm3,DESCALE_P2
+	paddd	mm7,mm1
+	paddd	mm0,mm1
+	psrad	mm7,DESCALE_P2
+	psrad	mm0,DESCALE_P2
+
+	packssdw  mm4,mm3		; mm4=data1=(01 11 21 31)
+	packssdw  mm7,mm0		; mm7=data6=(06 16 26 36)
+
+	packsswb  mm5,mm7		; mm5=(00 10 20 30 06 16 26 36)
+	packsswb  mm4,mm2		; mm4=(01 11 21 31 07 17 27 37)
+
+	movq	mm6, MMWORD [wk(6)]	; mm6=tmp12L
+	movq	mm1, MMWORD [wk(7)]	; mm1=tmp12H
+	movq	mm3, MMWORD [wk(10)]	; mm3=tmp1L
+	movq	mm0, MMWORD [wk(11)]	; mm0=tmp1H
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(00 10 20 30 06 16 26 36)
+	movq	MMWORD [wk(1)], mm4	; wk(1)=(01 11 21 31 07 17 27 37)
+
+	movq	mm7,mm6
+	movq	mm2,mm1
+	paddd	mm6,mm3			; mm6=data2L
+	paddd	mm1,mm0			; mm1=data2H
+	psubd	mm7,mm3			; mm7=data5L
+	psubd	mm2,mm0			; mm2=data5H
+
+	movq	mm5,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm5=[PD_DESCALE_P2]
+
+	paddd	mm6,mm5
+	paddd	mm1,mm5
+	psrad	mm6,DESCALE_P2
+	psrad	mm1,DESCALE_P2
+	paddd	mm7,mm5
+	paddd	mm2,mm5
+	psrad	mm7,DESCALE_P2
+	psrad	mm2,DESCALE_P2
+
+	packssdw  mm6,mm1		; mm6=data2=(02 12 22 32)
+	packssdw  mm7,mm2		; mm7=data5=(05 15 25 35)
+
+	movq	mm4, MMWORD [wk(2)]	; mm4=tmp13L
+	movq	mm3, MMWORD [wk(3)]	; mm3=tmp13H
+	movq	mm0, MMWORD [wk(8)]	; mm0=tmp0L
+	movq	mm5, MMWORD [wk(9)]	; mm5=tmp0H
+
+	movq	mm1,mm4
+	movq	mm2,mm3
+	paddd	mm4,mm0			; mm4=data3L
+	paddd	mm3,mm5			; mm3=data3H
+	psubd	mm1,mm0			; mm1=data4L
+	psubd	mm2,mm5			; mm2=data4H
+
+	movq	mm0,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm0=[PD_DESCALE_P2]
+
+	paddd	mm4,mm0
+	paddd	mm3,mm0
+	psrad	mm4,DESCALE_P2
+	psrad	mm3,DESCALE_P2
+	paddd	mm1,mm0
+	paddd	mm2,mm0
+	psrad	mm1,DESCALE_P2
+	psrad	mm2,DESCALE_P2
+
+	movq      mm5,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm5=[PB_CENTERJSAMP]
+
+	packssdw  mm4,mm3		; mm4=data3=(03 13 23 33)
+	packssdw  mm1,mm2		; mm1=data4=(04 14 24 34)
+
+	movq      mm0, MMWORD [wk(0)]	; mm0=(00 10 20 30 06 16 26 36)
+	movq      mm3, MMWORD [wk(1)]	; mm3=(01 11 21 31 07 17 27 37)
+
+	packsswb  mm6,mm1		; mm6=(02 12 22 32 04 14 24 34)
+	packsswb  mm4,mm7		; mm4=(03 13 23 33 05 15 25 35)
+
+	paddb     mm0,mm5
+	paddb     mm3,mm5
+	paddb     mm6,mm5
+	paddb     mm4,mm5
+
+	movq      mm2,mm0		; transpose coefficients(phase 1)
+	punpcklbw mm0,mm3		; mm0=(00 01 10 11 20 21 30 31)
+	punpckhbw mm2,mm3		; mm2=(06 07 16 17 26 27 36 37)
+	movq      mm1,mm6		; transpose coefficients(phase 1)
+	punpcklbw mm6,mm4		; mm6=(02 03 12 13 22 23 32 33)
+	punpckhbw mm1,mm4		; mm1=(04 05 14 15 24 25 34 35)
+
+	movq      mm7,mm0		; transpose coefficients(phase 2)
+	punpcklwd mm0,mm6		; mm0=(00 01 02 03 10 11 12 13)
+	punpckhwd mm7,mm6		; mm7=(20 21 22 23 30 31 32 33)
+	movq      mm5,mm1		; transpose coefficients(phase 2)
+	punpcklwd mm1,mm2		; mm1=(04 05 06 07 14 15 16 17)
+	punpckhwd mm5,mm2		; mm5=(24 25 26 27 34 35 36 37)
+
+	movq      mm3,mm0		; transpose coefficients(phase 3)
+	punpckldq mm0,mm1		; mm0=(00 01 02 03 04 05 06 07)
+	punpckhdq mm3,mm1		; mm3=(10 11 12 13 14 15 16 17)
+	movq      mm4,mm7		; transpose coefficients(phase 3)
+	punpckldq mm7,mm5		; mm7=(20 21 22 23 24 25 26 27)
+	punpckhdq mm4,mm5		; mm4=(30 31 32 33 34 35 36 37)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_JCOEF	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
diff --git a/simd/jimmxred.asm b/simd/jimmxred.asm
new file mode 100644
index 0000000..a5ad52b
--- /dev/null
+++ b/simd/jimmxred.asm
@@ -0,0 +1,703 @@
+;
+; jimmxred.asm - reduced-size IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; [TAB8]
+
+%include "simd/jsimdext.inc"
+%include "simd/jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
+%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
+%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
+%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)
+
+%if CONST_BITS == 13
+F_0_211	equ	 1730		; FIX(0.211164243)
+F_0_509	equ	 4176		; FIX(0.509795579)
+F_0_601	equ	 4926		; FIX(0.601344887)
+F_0_720	equ	 5906		; FIX(0.720959822)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_850	equ	 6967		; FIX(0.850430095)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_061	equ	 8697		; FIX(1.061594337)
+F_1_272	equ	10426		; FIX(1.272758580)
+F_1_451	equ	11893		; FIX(1.451774981)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_2_172	equ	17799		; FIX(2.172734803)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_624	equ	29692		; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
+F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
+F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
+F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
+F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
+F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_red_mmx)
+
+EXTN(jconst_idct_red_mmx):
+
+PW_F184_MF076	times 2 dw  F_1_847,-F_0_765
+PW_F256_F089	times 2 dw  F_2_562, F_0_899
+PW_F106_MF217	times 2 dw  F_1_061,-F_2_172
+PW_MF060_MF050	times 2 dw -F_0_601,-F_0_509
+PW_F145_MF021	times 2 dw  F_1_451,-F_0_211
+PW_F362_MF127	times 2 dw  F_3_624,-F_1_272
+PW_F085_MF072	times 2 dw  F_0_850,-F_0_720
+PD_DESCALE_P1_4	times 2 dd  1 << (DESCALE_P1_4-1)
+PD_DESCALE_P2_4	times 2 dd  1 << (DESCALE_P2_4-1)
+PD_DESCALE_P1_2	times 2 dd  1 << (DESCALE_P1_2-1)
+PD_DESCALE_P2_2	times 2 dd  1 << (DESCALE_P2_2-1)
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_mmx (void * dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_4x4_mmx)
+
+EXTN(jsimd_idct_4x4_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm0,mm1
+	packsswb mm0,mm0
+	movd	eax,mm0
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	mm0,PASS1_BITS
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movq      mm4,mm0
+	movq      mm5,mm0
+	punpcklwd mm4,mm1
+	punpckhwd mm5,mm1
+	movq      mm0,mm4
+	movq      mm1,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L)
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H)
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L)
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H)
+
+	movq      mm6,mm2
+	movq      mm7,mm2
+	punpcklwd mm6,mm3
+	punpckhwd mm7,mm3
+	movq      mm2,mm6
+	movq      mm3,mm7
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L)
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H)
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L)
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H)
+
+	paddd	mm6,mm4			; mm6=tmp2L
+	paddd	mm7,mm5			; mm7=tmp2H
+	paddd	mm2,mm0			; mm2=tmp0L
+	paddd	mm3,mm1			; mm3=tmp0H
+
+	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
+	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	pxor      mm1,mm1
+	pxor      mm2,mm2
+	punpcklwd mm1,mm4		; mm1=tmp0L
+	punpckhwd mm2,mm4		; mm2=tmp0H
+	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
+	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+	movq      mm3,mm5		; mm5=in2=z2
+	punpcklwd mm5,mm0		; mm0=in6=z3
+	punpckhwd mm3,mm0
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
+
+	movq	mm4,mm1
+	movq	mm0,mm2
+	paddd	mm1,mm5			; mm1=tmp10L
+	paddd	mm2,mm3			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp12L
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	; -- Final output stage
+
+	movq	mm5,mm1
+	movq	mm3,mm2
+	paddd	mm1,mm6			; mm1=data0L
+	paddd	mm2,mm7			; mm2=data0H
+	psubd	mm5,mm6			; mm5=data3L
+	psubd	mm3,mm7			; mm3=data3H
+
+	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm6=[PD_DESCALE_P1_4]
+
+	paddd	mm1,mm6
+	paddd	mm2,mm6
+	psrad	mm1,DESCALE_P1_4
+	psrad	mm2,DESCALE_P1_4
+	paddd	mm5,mm6
+	paddd	mm3,mm6
+	psrad	mm5,DESCALE_P1_4
+	psrad	mm3,DESCALE_P1_4
+
+	packssdw  mm1,mm2		; mm1=data0=(00 01 02 03)
+	packssdw  mm5,mm3		; mm5=data3=(30 31 32 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
+
+	movq	mm2,mm4
+	movq	mm3,mm0
+	paddd	mm4,mm7			; mm4=data1L
+	paddd	mm0,mm6			; mm0=data1H
+	psubd	mm2,mm7			; mm2=data2L
+	psubd	mm3,mm6			; mm3=data2H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm7=[PD_DESCALE_P1_4]
+
+	paddd	mm4,mm7
+	paddd	mm0,mm7
+	psrad	mm4,DESCALE_P1_4
+	psrad	mm0,DESCALE_P1_4
+	paddd	mm2,mm7
+	paddd	mm3,mm7
+	psrad	mm2,DESCALE_P1_4
+	psrad	mm3,DESCALE_P1_4
+
+	packssdw  mm4,mm0		; mm4=data1=(10 11 12 13)
+	packssdw  mm2,mm3		; mm2=data2=(20 21 22 23)
+
+	movq      mm6,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm4		; mm1=(00 10 01 11)
+	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
+	movq      mm7,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm5		; mm2=(20 30 21 31)
+	punpckhwd mm7,mm5		; mm7=(22 32 23 33)
+
+	movq      mm0,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm2		; mm1=(00 10 20 30)
+	punpckhdq mm0,mm2		; mm0=(01 11 21 31)
+	movq      mm3,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm7		; mm6=(02 12 22 32)
+	punpckhdq mm3,mm7		; mm3=(03 13 23 33)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	movq      mm4,mm0
+	movq      mm5,mm0
+	punpcklwd mm4,mm1
+	punpckhwd mm5,mm1
+	movq      mm0,mm4
+	movq      mm1,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L)
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H)
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L)
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H)
+
+	movq      mm6,mm2
+	movq      mm7,mm2
+	punpcklwd mm6,mm3
+	punpckhwd mm7,mm3
+	movq      mm2,mm6
+	movq      mm3,mm7
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L)
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H)
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L)
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H)
+
+	paddd	mm6,mm4			; mm6=tmp2L
+	paddd	mm7,mm5			; mm7=tmp2H
+	paddd	mm2,mm0			; mm2=tmp0L
+	paddd	mm3,mm1			; mm3=tmp0H
+
+	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
+	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	pxor      mm1,mm1
+	pxor      mm2,mm2
+	punpcklwd mm1,mm4		; mm1=tmp0L
+	punpckhwd mm2,mm4		; mm2=tmp0H
+	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
+	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+	movq      mm3,mm5		; mm5=in2=z2
+	punpcklwd mm5,mm0		; mm0=in6=z3
+	punpckhwd mm3,mm0
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
+
+	movq	mm4,mm1
+	movq	mm0,mm2
+	paddd	mm1,mm5			; mm1=tmp10L
+	paddd	mm2,mm3			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp12L
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	; -- Final output stage
+
+	movq	mm5,mm1
+	movq	mm3,mm2
+	paddd	mm1,mm6			; mm1=data0L
+	paddd	mm2,mm7			; mm2=data0H
+	psubd	mm5,mm6			; mm5=data3L
+	psubd	mm3,mm7			; mm3=data3H
+
+	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm6=[PD_DESCALE_P2_4]
+
+	paddd	mm1,mm6
+	paddd	mm2,mm6
+	psrad	mm1,DESCALE_P2_4
+	psrad	mm2,DESCALE_P2_4
+	paddd	mm5,mm6
+	paddd	mm3,mm6
+	psrad	mm5,DESCALE_P2_4
+	psrad	mm3,DESCALE_P2_4
+
+	packssdw  mm1,mm2		; mm1=data0=(00 10 20 30)
+	packssdw  mm5,mm3		; mm5=data3=(03 13 23 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
+
+	movq	mm2,mm4
+	movq	mm3,mm0
+	paddd	mm4,mm7			; mm4=data1L
+	paddd	mm0,mm6			; mm0=data1H
+	psubd	mm2,mm7			; mm2=data2L
+	psubd	mm3,mm6			; mm3=data2H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm7=[PD_DESCALE_P2_4]
+
+	paddd	mm4,mm7
+	paddd	mm0,mm7
+	psrad	mm4,DESCALE_P2_4
+	psrad	mm0,DESCALE_P2_4
+	paddd	mm2,mm7
+	paddd	mm3,mm7
+	psrad	mm2,DESCALE_P2_4
+	psrad	mm3,DESCALE_P2_4
+
+	packssdw  mm4,mm0		; mm4=data1=(01 11 21 31)
+	packssdw  mm2,mm3		; mm2=data2=(02 12 22 32)
+
+	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
+
+	packsswb  mm1,mm2		; mm1=(00 10 20 30 02 12 22 32)
+	packsswb  mm4,mm5		; mm4=(01 11 21 31 03 13 23 33)
+	paddb     mm1,mm6
+	paddb     mm4,mm6
+
+	movq      mm7,mm1		; transpose coefficients(phase 1)
+	punpcklbw mm1,mm4		; mm1=(00 01 10 11 20 21 30 31)
+	punpckhbw mm7,mm4		; mm7=(02 03 12 13 22 23 32 33)
+
+	movq      mm0,mm1		; transpose coefficients(phase 2)
+	punpcklwd mm1,mm7		; mm1=(00 01 02 03 10 11 12 13)
+	punpckhwd mm0,mm7		; mm0=(20 21 22 23 30 31 32 33)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
+	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+
+	psrlq	mm1,4*BYTE_BIT
+	psrlq	mm0,4*BYTE_BIT
+
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
+	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_mmx (void * dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+	align	16
+	global	EXTN(jsimd_idct_2x2_mmx)
+
+EXTN(jsimd_idct_2x2_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+	mov	edx, POINTER [dct_table(ebp)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+
+	; | input:                  | result:        |
+	; | 00 01 ** 03 ** 05 ** 07 |                |
+	; | 10 11 ** 13 ** 15 ** 17 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+	; | 50 51 ** 53 ** 55 ** 57 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 70 71 ** 73 ** 75 ** 77 |                |
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
+	; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
+
+	pcmpeqd   mm7,mm7
+	pslld     mm7,WORD_BIT		; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
+
+	movq      mm4,mm0		; mm4=(10 11 ** 13)
+	movq      mm5,mm2		; mm5=(50 51 ** 53)
+	punpcklwd mm4,mm1		; mm4=(10 30 11 31)
+	punpcklwd mm5,mm3		; mm5=(50 70 51 71)
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
+
+	psrld	mm0,WORD_BIT		; mm0=(11 -- 13 --)
+	pand	mm1,mm7			; mm1=(-- 31 -- 33)
+	psrld	mm2,WORD_BIT		; mm2=(51 -- 53 --)
+	pand	mm3,mm7			; mm3=(-- 71 -- 73)
+	por	mm0,mm1			; mm0=(11 31 13 33)
+	por	mm2,mm3			; mm2=(51 71 53 73)
+	pmaddwd	mm0,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd	mm2,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd	mm4,mm5			; mm4=tmp0[col0 col1]
+
+	movq	mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
+	; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
+
+	psrld	mm6,WORD_BIT		; mm6=(15 -- 17 --)
+	pand	mm1,mm7			; mm1=(-- 35 -- 37)
+	psrld	mm3,WORD_BIT		; mm3=(55 -- 57 --)
+	pand	mm5,mm7			; mm5=(-- 75 -- 77)
+	por	mm6,mm1			; mm6=(15 35 17 37)
+	por	mm3,mm5			; mm3=(55 75 57 77)
+	pmaddwd	mm6,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd	mm3,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd	mm0,mm2			; mm0=tmp0[col1 col3]
+	paddd	mm6,mm3			; mm6=tmp0[col5 col7]
+
+	; -- Even part
+
+	movq	mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
+
+	movq	mm2,mm1				; mm2=(00 01 ** 03)
+	pslld	mm1,WORD_BIT			; mm1=(-- 00 -- **)
+	psrad	mm1,(WORD_BIT-CONST_BITS-2)	; mm1=tmp10[col0 ****]
+
+	pand	mm2,mm7				; mm2=(-- 01 -- 03)
+	pand	mm5,mm7				; mm5=(-- 05 -- 07)
+	psrad	mm2,(WORD_BIT-CONST_BITS-2)	; mm2=tmp10[col1 col3]
+	psrad	mm5,(WORD_BIT-CONST_BITS-2)	; mm5=tmp10[col5 col7]
+
+	; -- Final output stage
+
+	movq      mm3,mm1
+	paddd     mm1,mm4		; mm1=data0[col0 ****]=(A0 **)
+	psubd     mm3,mm4		; mm3=data1[col0 ****]=(B0 **)
+	punpckldq mm1,mm3		; mm1=(A0 B0)
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)]	; mm7=[PD_DESCALE_P1_2]
+
+	movq	mm4,mm2
+	movq	mm3,mm5
+	paddd	mm2,mm0			; mm2=data0[col1 col3]=(A1 A3)
+	paddd	mm5,mm6			; mm5=data0[col5 col7]=(A5 A7)
+	psubd	mm4,mm0			; mm4=data1[col1 col3]=(B1 B3)
+	psubd	mm3,mm6			; mm3=data1[col5 col7]=(B5 B7)
+
+	paddd	mm1,mm7
+	psrad	mm1,DESCALE_P1_2
+
+	paddd	mm2,mm7
+	paddd	mm5,mm7
+	psrad	mm2,DESCALE_P1_2
+	psrad	mm5,DESCALE_P1_2
+	paddd	mm4,mm7
+	paddd	mm3,mm7
+	psrad	mm4,DESCALE_P1_2
+	psrad	mm3,DESCALE_P1_2
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(ebp)]
+
+	; | input:| result:|
+	; | A0 B0 |        |
+	; | A1 B1 | C0 C1  |
+	; | A3 B3 | D0 D1  |
+	; | A5 B5 |        |
+	; | A7 B7 |        |
+
+	; -- Odd part
+
+	packssdw  mm2,mm4		; mm2=(A1 A3 B1 B3)
+	packssdw  mm5,mm3		; mm5=(A5 A7 B5 B7)
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd     mm2,mm5		; mm2=tmp0[row0 row1]
+
+	; -- Even part
+
+	pslld     mm1,(CONST_BITS+2)	; mm1=tmp10[row0 row1]
+
+	; -- Final output stage
+
+	movq      mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)]	; mm0=[PD_DESCALE_P2_2]
+
+	movq      mm6,mm1
+	paddd     mm1,mm2		; mm1=data0[row0 row1]=(C0 C1)
+	psubd     mm6,mm2		; mm6=data1[row0 row1]=(D0 D1)
+
+	paddd     mm1,mm0
+	paddd     mm6,mm0
+	psrad     mm1,DESCALE_P2_2
+	psrad     mm6,DESCALE_P2_2
+
+	movq      mm7,mm1		; transpose coefficients
+	punpckldq mm1,mm6		; mm1=(C0 D0)
+	punpckhdq mm7,mm6		; mm7=(C1 D1)
+
+	packssdw  mm1,mm7		; mm1=(C0 D0 C1 D1)
+	packsswb  mm1,mm1		; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
+	paddb     mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
+
+	movd	ecx,mm1
+	movd	ebx,mm1			; ebx=(C0 D0 C1 D1)
+	shr	ecx,2*BYTE_BIT		; ecx=(C1 D1 -- --)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	WORD [edx+eax*SIZEOF_JSAMPLE], bx
+	mov	WORD [esi+eax*SIZEOF_JSAMPLE], cx
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 09ec46d..be48b84 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -11,13 +11,108 @@
 /* Bitmask for supported acceleration methods */
 
 #define JSIMD_NONE    0x00
+#define JSIMD_MMX     0x01
 
 /* Short forms of external names for systems with brain-damaged linkers. */
 
 #ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_simd_cpu_support	jSiCpuSupport
+#define jpeg_simd_cpu_support                 jSiCpuSupport
+#define jsimd_rgb_ycc_convert_mmx             jSRGBYCCM
+#define jsimd_ycc_rgb_convert_mmx             jSYCCRGBM
+#define jsimd_h2v2_downsample_mmx             jSDnH2V2M
+#define jsimd_h2v1_downsample_mmx             jSDnH2V1M
+#define jsimd_h2v2_upsample_mmx               jSUpH2V2M
+#define jsimd_h2v1_upsample_mmx               jSUpH2V1M
+#define jsimd_h2v2_fancy_upsample_mmx         jSFUpH2V2M
+#define jsimd_h2v1_fancy_upsample_mmx         jSFUpH2V1M
+#define jsimd_h2v2_merged_upsample_mmx        jSMUpH2V2M
+#define jsimd_h2v1_merged_upsample_mmx        jSMUpH2V1M
+#define jsimd_convsamp_mmx                    jSConvM
+#define jsimd_fdct_islow_mmx                  jSFDMIS
+#define jsimd_fdct_ifast_mmx                  jSFDMIF
+#define jsimd_quantize_mmx                    jSQuantM
+#define jsimd_idct_2x2_mmx                    jSIDM22
+#define jsimd_idct_4x4_mmx                    jSIDM44
+#define jsimd_idct_islow_mmx                  jSIDMIS
+#define jsimd_idct_ifast_mmx                  jSIDMIF
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
 
 /* SIMD Ext: retrieve SIMD/CPU information */
 EXTERN(unsigned int) jpeg_simd_cpu_support JPP((void));
 
+/* SIMD Color Space Conversion */
+EXTERN(void) jsimd_rgb_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_ycc_rgb_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
+/* SIMD Downsample */
+EXTERN(void) jsimd_h2v2_downsample_mmx
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample_mmx
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+/* SIMD Upsample */
+EXTERN(void) jsimd_h2v2_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+
+/* SIMD Sample Conversion */
+EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
+                                     JDIMENSION start_col,
+                                     DCTELEM * workspace));
+
+/* SIMD Forward DCT */
+EXTERN(void) jsimd_fdct_islow_mmx JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_ifast_mmx JPP((DCTELEM * data));
+
+/* SIMD Quantization */
+EXTERN(void) jsimd_quantize_mmx JPP((JCOEFPTR coef_block,
+                                     DCTELEM * divisors,
+                                     DCTELEM * workspace));
+
+/* SIMD Reduced Inverse DCT */
+EXTERN(void) jsimd_idct_2x2_mmx JPP((void * dct_table,
+                                     JCOEFPTR coef_block,
+                                     JSAMPARRAY output_buf,
+                                     JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4_mmx JPP((void * dct_table,
+                                     JCOEFPTR coef_block,
+                                     JSAMPARRAY output_buf,
+                                     JDIMENSION output_col));
+
+/* SIMD Inverse DCT */
+EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table,
+                                       JCOEFPTR coef_block,
+                                       JSAMPARRAY output_buf,
+                                       JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast_mmx JPP((void * dct_table,
+                                       JCOEFPTR coef_block,
+                                       JSAMPARRAY output_buf,
+                                       JDIMENSION output_col));
+
diff --git a/simd/jsimdcfg.inc.h b/simd/jsimdcfg.inc.h
index bc2ff9f..8fcbe47 100644
--- a/simd/jsimdcfg.inc.h
+++ b/simd/jsimdcfg.inc.h
@@ -1,5 +1,8 @@
 // This file generates the include file for the assembly
 // implementations by abusing the C preprocessor.
+//
+// Note: Some things are manually defined as they need to
+// be mapped to NASM types.
 
 ;
 ; Automatically generated include file from jsimdcfg.inc.h
@@ -16,14 +19,100 @@
 #define definev(var) %define _cpp_protection_##var var
 
 ;
+; -- jpeglib.h
+;
+
+definev(DCTSIZE)
+definev(DCTSIZE2)
+
+;
+; -- jmorecfg.h
+;
+
+definev(RGB_RED)
+definev(RGB_GREEN)
+definev(RGB_BLUE)
+
+definev(RGB_PIXELSIZE)
+
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+
+%define JSAMPLE                 byte          ; unsigned char
+%define SIZEOF_JSAMPLE          SIZEOF_BYTE   ; sizeof(JSAMPLE)
+
+definev(CENTERJSAMPLE)
+
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF                   word          ; short
+%define SIZEOF_JCOEF            SIZEOF_WORD   ; sizeof(JCOEF)
+
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION              dword         ; unsigned int
+%define SIZEOF_JDIMENSION       SIZEOF_DWORD  ; sizeof(JDIMENSION)
+
+%define JSAMPROW                POINTER       ; JSAMPLE FAR * (jpeglib.h)
+%define JSAMPARRAY              POINTER       ; JSAMPROW *    (jpeglib.h)
+%define JSAMPIMAGE              POINTER       ; JSAMPARRAY *  (jpeglib.h)
+%define JCOEFPTR                POINTER       ; JCOEF FAR *   (jpeglib.h)
+%define SIZEOF_JSAMPROW         SIZEOF_POINTER  ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY       SIZEOF_POINTER  ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE       SIZEOF_POINTER  ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR         SIZEOF_POINTER  ; sizeof(JCOEFPTR)
+
+;
+; -- jdct.h
+;
+
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM                 word          ; short
+%define SIZEOF_DCTELEM          SIZEOF_WORD   ; sizeof(DCTELEM)
+
+; To maximize parallelism, Type MULTIPLIER is changed to short.
+;
+%define ISLOW_MULT_TYPE         word          ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE  SIZEOF_WORD   ; sizeof(ISLOW_MULT_TYPE)
+
+%define IFAST_MULT_TYPE         word          ; must be short
+%define SIZEOF_IFAST_MULT_TYPE  SIZEOF_WORD   ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS        2             ; fractional bits in scale factors
+
+;
 ; -- jsimd.h
 ;
 
 definev(JSIMD_NONE)
+definev(JSIMD_MMX)
 
 ; Short forms of external names for systems with brain-damaged linkers.
 ;
 #ifdef NEED_SHORT_EXTERNAL_NAMES
 definev(jpeg_simd_cpu_support)
+definev(jsimd_rgb_ycc_convert_mmx)
+definev(jsimd_ycc_rgb_convert_mmx)
+definev(jsimd_h2v2_downsample_mmx)
+definev(jsimd_h2v1_downsample_mmx)
+definev(jsimd_h2v2_upsample_mmx)
+definev(jsimd_h2v1_upsample_mmx)
+definev(jsimd_h2v1_fancy_upsample_mmx)
+definev(jsimd_h2v2_fancy_upsample_mmx)
+definev(jsimd_h2v1_merged_upsample_mmx)
+definev(jsimd_h2v2_merged_upsample_mmx)
+definev(jsimd_convsamp_mmx)
+definev(jsimd_fdct_islow_mmx)
+definev(jsimd_fdct_ifast_mmx)
+definev(jsimd_quantize_mmx)
+definev(jsimd_idct_2x2_mmx)
+definev(jsimd_idct_4x4_mmx)
+definev(jsimd_idct_islow_mmx)
+definev(jsimd_idct_ifast_mmx)
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
 
diff --git a/simd/jsimdcpu.asm b/simd/jsimdcpu.asm
index 6bb4e32..862de29 100644
--- a/simd/jsimdcpu.asm
+++ b/simd/jsimdcpu.asm
@@ -51,6 +51,22 @@ EXTN(jpeg_simd_cpu_support):
 	xor	eax,edx
 	jz	short .return		; CPUID is not supported
 
+	; Check for MMX instruction support
+	xor	eax,eax
+	cpuid
+	test	eax,eax
+	jz	short .return
+
+	xor	eax,eax
+	inc	eax
+	cpuid
+	mov	eax,edx			; eax = Standard feature flags
+
+	test	eax, 1<<23		; bit23:MMX
+	jz	short .no_mmx
+	or	edi, byte JSIMD_MMX
+.no_mmx:
+
 .return:
 	mov	eax,edi
 
diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc
index c0bd54c..08e3a04 100644
--- a/simd/jsimdext.inc
+++ b/simd/jsimdext.inc
@@ -107,6 +107,24 @@
 %define SIZEOF_POINTER          SIZEOF_DWORD    ; sizeof(POINTER)
 %define POINTER_BIT             DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
 
+%define INT                     dword           ; signed integer type
+%define SIZEOF_INT              SIZEOF_DWORD    ; sizeof(INT)
+%define INT_BIT                 DWORD_BIT       ; sizeof(INT)*BYTE_BIT
+
+%define MMWORD                  qword           ; int64  (MMX register)
+%define SIZEOF_MMWORD           SIZEOF_QWORD    ; sizeof(MMWORD)
+%define MMWORD_BIT              QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT
+
+%define SIZEOF_BYTE             1               ; sizeof(BYTE)
+%define SIZEOF_WORD             2               ; sizeof(WORD)
+%define SIZEOF_DWORD            4               ; sizeof(DWORD)
+%define SIZEOF_QWORD            8               ; sizeof(QWORD)
+
+%define BYTE_BIT                8               ; CHAR_BIT in C
+%define WORD_BIT                16              ; sizeof(WORD)*BYTE_BIT
+%define DWORD_BIT               32              ; sizeof(DWORD)*BYTE_BIT
+%define QWORD_BIT               64              ; sizeof(QWORD)*BYTE_BIT
+
 ; --------------------------------------------------------------------------
 ;  External Symbol Name
 ;
author	ossman_ <ossman_@632fc199-4ca6-4c93-a231-07263d6284db>	2009-03-09 13:25:30 +0000
committer	ossman_ <ossman_@632fc199-4ca6-4c93-a231-07263d6284db>	2009-03-09 13:25:30 +0000
commit	65984f5ac9933a1fc74c22da22671156f9b9de15 (patch)
tree	809d3bf09560d2e99d61f190ea2d16e6d3122b13 /simd
parent	ddd9d54d5e181e76197c787220f3bc46e206deba (diff)