summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author dcommander <dcommander@3789f03b-4d11-0410-bbf8-ca57d06f2519> 2012-06-15 21:58:06 +0000
committer dcommander <dcommander@3789f03b-4d11-0410-bbf8-ca57d06f2519> 2012-06-15 21:58:06 +0000
commit bdc5c7e3e8d5a0b0b1336f433a2cea08a4e606bf (patch)
tree 58c03b1eb58dee37246cd717d493893760097f2b
parent 968435b9d1895c339b5a762d1ef1d82e4e9b51f2 (diff)
Fixed a regression caused by a bug in the 32-bit strict memory access code in jdmrgss2.asm (contributed by Chromium to stop valgrind from whining whenever the output buffer size was not evenly divisible by 16 bytes). On Linux/x86, this regression caused incorrect pixels on the right-hand side of images whose row size was not evenly divisible by 16 bytes, whenever merged upsampling was used (that is, whenever fancy upsampling was not used). This patch also enables the strict memory access code on all platforms, not just Linux (it does no harm on other platforms), and removes a couple of pcmpeqb instructions that were rendered unnecessary by r836.
git-svn-id: https://libjpeg-turbo.svn.sourceforge.net/svnroot/libjpeg-turbo@839 3789f03b-4d11-0410-bbf8-ca57d06f2519
-rw-r--r-- trunk/ChangeLog.txt 6
-rw-r--r-- trunk/simd/jdclrss2-64.asm 88
-rw-r--r-- trunk/simd/jdclrss2.asm 88
-rw-r--r-- trunk/simd/jdmrgss2-64.asm 89
-rw-r--r-- trunk/simd/jdmrgss2.asm 92
-rw-r--r-- trunk/simd/jsimdext.inc 2
6 files changed, 13 insertions, 352 deletions
diff --git a/trunk/ChangeLog.txt b/trunk/ChangeLog.txt
index 5453869..c613b1c 100644
--- a/trunk/ChangeLog.txt
+++ b/trunk/ChangeLog.txt
@@ -44,6 +44,12 @@ it is painfully slow on Bobcat processors in particular. Eliminating the use
of this instruction improved performance by an order of magnitude on Bobcat
processors and by a small amount (typically 5%) on AMD desktop processors.
+[10] Fixed a regression caused by 1.2.0[2] whereby, on Linux/x86 platforms,
+decompressing a 4:2:0 or 4:2:2 JPEG image without using fancy upsampling would
+produce several incorrect columns of pixels at the right-hand side of the
+output image if each row in the output image was not evenly divisible by 16
+bytes.
+
1.2.0
=====
diff --git a/trunk/simd/jdclrss2-64.asm b/trunk/simd/jdclrss2-64.asm
index 06cb213..9b2e930 100644
--- a/trunk/simd/jdclrss2-64.asm
+++ b/trunk/simd/jdclrss2-64.asm
@@ -1,7 +1,7 @@
;
; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
@@ -267,7 +267,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
jmp near .columnloop
.column_st32:
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
@@ -285,7 +284,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_MMWORD
@@ -319,47 +317,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
test rcx, rcx
jz short .nextrow
mov BYTE [rdi], al
-%else
- mov rax,rcx
- xor rcx, byte 0x0F
- shl rcx, 2
- movd xmmB,ecx
- psrlq xmmH,4
- pcmpeqb xmmE,xmmE
- psrlq xmmH,xmmB
- psrlq xmmE,xmmB
- punpcklbw xmmE,xmmH
- ; ----------------
- mov rcx,rdi
- and rcx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- add rax,rcx
- cmp rax, byte SIZEOF_XMMWORD
- ja short .adj0
- and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,rcx
- movdqa xmmG,xmmA
- movdqa xmmC,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmD,ecx
- sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmF,ecx
- psllq xmmA,xmmF
- psllq xmmE,xmmF
- jmp short .adj0
-.adj1: neg ecx
- movd xmmF,ecx
- psrlq xmmA,xmmF
- psrlq xmmE,xmmF
- psllq xmmG,xmmD
- psllq xmmC,xmmD
- por xmmA,xmmG
- por xmmE,xmmC
-.adj0: ; ----------------
- movdqu XMMWORD [rdi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -421,7 +378,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
jmp near .columnloop
.column_st32:
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
@@ -438,7 +394,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_XMMWORD/8
@@ -453,47 +408,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
test rcx, rcx
jz short .nextrow
movd DWORD [rdi], xmmA
-%else
- cmp rcx, byte SIZEOF_XMMWORD/16
- jb near .nextrow
- mov rax,rcx
- xor rcx, byte 0x03
- inc rcx
- shl rcx, 4
- movd xmmF,ecx
- psrlq xmmE,xmmF
- punpcklbw xmmE,xmmE
- ; ----------------
- mov rcx,rdi
- and rcx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
- cmp rax, byte SIZEOF_XMMWORD
- ja short .adj0
- and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmB,xmmA
- movdqa xmmG,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmC,ecx
- sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmH,ecx
- psllq xmmA,xmmH
- psllq xmmE,xmmH
- jmp short .adj0
-.adj1: neg rcx
- movd xmmH,ecx
- psrlq xmmA,xmmH
- psrlq xmmE,xmmH
- psllq xmmB,xmmC
- psllq xmmG,xmmC
- por xmmA,xmmB
- por xmmE,xmmG
-.adj0: ; ----------------
- movdqu XMMWORD [rdi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
diff --git a/trunk/simd/jdclrss2.asm b/trunk/simd/jdclrss2.asm
index 1354c3d..d26a5bb 100644
--- a/trunk/simd/jdclrss2.asm
+++ b/trunk/simd/jdclrss2.asm
@@ -1,7 +1,7 @@
;
; jdclrss2.asm - colorspace conversion (SSE2)
;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
@@ -279,7 +279,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
alignx 16,7
.column_st32:
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
@@ -297,7 +296,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
@@ -331,47 +329,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
test ecx, ecx
jz short .nextrow
mov BYTE [edi], al
-%else
- mov eax,ecx
- xor ecx, byte 0x0F
- shl ecx, 2
- movd xmmB,ecx
- psrlq xmmH,4
- pcmpeqb xmmE,xmmE
- psrlq xmmH,xmmB
- psrlq xmmE,xmmB
- punpcklbw xmmE,xmmH
- ; ----------------
- mov ecx,edi
- and ecx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- add eax,ecx
- cmp eax, byte SIZEOF_XMMWORD
- ja short .adj0
- and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmG,xmmA
- movdqa xmmC,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmD,ecx
- sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmF,ecx
- psllq xmmA,xmmF
- psllq xmmE,xmmF
- jmp short .adj0
-.adj1: neg ecx
- movd xmmF,ecx
- psrlq xmmA,xmmF
- psrlq xmmE,xmmF
- psllq xmmG,xmmD
- psllq xmmC,xmmD
- por xmmA,xmmG
- por xmmE,xmmC
-.adj0: ; ----------------
- movdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -434,7 +391,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
alignx 16,7
.column_st32:
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
@@ -451,7 +407,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
@@ -466,47 +421,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
test ecx, ecx
jz short .nextrow
movd DWORD [edi], xmmA
-%else
- cmp ecx, byte SIZEOF_XMMWORD/16
- jb short .nextrow
- mov eax,ecx
- xor ecx, byte 0x03
- inc ecx
- shl ecx, 4
- movd xmmF,ecx
- psrlq xmmE,xmmF
- punpcklbw xmmE,xmmE
- ; ----------------
- mov ecx,edi
- and ecx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
- cmp eax, byte SIZEOF_XMMWORD
- ja short .adj0
- and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmB,xmmA
- movdqa xmmG,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmC,ecx
- sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmH,ecx
- psllq xmmA,xmmH
- psllq xmmE,xmmH
- jmp short .adj0
-.adj1: neg ecx
- movd xmmH,ecx
- psrlq xmmA,xmmH
- psrlq xmmE,xmmH
- psllq xmmB,xmmC
- psllq xmmG,xmmC
- por xmmA,xmmB
- por xmmE,xmmG
-.adj0: ; ----------------
- movdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
diff --git a/trunk/simd/jdmrgss2-64.asm b/trunk/simd/jdmrgss2-64.asm
index ffe0288..5d8fc46 100644
--- a/trunk/simd/jdmrgss2-64.asm
+++ b/trunk/simd/jdmrgss2-64.asm
@@ -1,7 +1,7 @@
;
; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
@@ -12,7 +12,7 @@
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ for
+; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
@@ -288,7 +288,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_MMWORD
@@ -322,47 +321,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
test rcx, rcx
jz short .endcolumn
mov BYTE [rdi], al
-%else
- mov rax,rcx
- xor rcx, byte 0x0F
- shl rcx, 2
- movd xmmB,ecx
- psrlq xmmH,4
- pcmpeqb xmmE,xmmE
- psrlq xmmH,xmmB
- psrlq xmmE,xmmB
- punpcklbw xmmE,xmmH
- ; ----------------
- mov rcx,rdi
- and rcx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- add rax,rcx
- cmp rax, byte SIZEOF_XMMWORD
- ja short .adj0
- and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmG,xmmA
- movdqa xmmC,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmD,ecx
- sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmF,ecx
- psllq xmmA,xmmF
- psllq xmmE,xmmF
- jmp short .adj0
-.adj1: neg rcx
- movd xmmF,ecx
- psrlq xmmA,xmmF
- psrlq xmmE,xmmF
- psllq xmmG,xmmD
- psllq xmmC,xmmD
- por xmmA,xmmG
- por xmmE,xmmC
-.adj0: ; ----------------
- movdqu XMMWORD [rdi],xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -427,7 +385,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
jmp near .columnloop
.column_st32:
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
@@ -444,7 +401,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_XMMWORD/8
@@ -459,47 +415,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
test rcx, rcx
jz short .endcolumn
movd DWORD [rdi], xmmA
-%else
- cmp rcx, byte SIZEOF_XMMWORD/16
- jb near .endcolumn
- mov rax,rcx
- xor rcx, byte 0x03
- inc rcx
- shl rcx, 4
- movd xmmF,ecx
- psrlq xmmE,xmmF
- punpcklbw xmmE,xmmE
- ; ----------------
- mov rcx,rdi
- and rcx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
- cmp rax, byte SIZEOF_XMMWORD
- ja short .adj0
- and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmB,xmmA
- movdqa xmmG,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmC,ecx
- sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmH,ecx
- psllq xmmA,xmmH
- psllq xmmE,xmmH
- jmp short .adj0
-.adj1: neg rcx
- movd xmmH,ecx
- psrlq xmmA,xmmH
- psrlq xmmE,xmmH
- psllq xmmB,xmmC
- psllq xmmG,xmmC
- por xmmA,xmmB
- por xmmE,xmmG
-.adj0: ; ----------------
- movdqu XMMWORD [rdi],xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
diff --git a/trunk/simd/jdmrgss2.asm b/trunk/simd/jdmrgss2.asm
index 556a490..f190468 100644
--- a/trunk/simd/jdmrgss2.asm
+++ b/trunk/simd/jdmrgss2.asm
@@ -1,7 +1,7 @@
;
; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
@@ -284,7 +284,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
alignx 16,7
.column_st32:
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
@@ -302,7 +301,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
@@ -336,47 +334,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
test ecx, ecx
jz short .endcolumn
mov BYTE [edi], al
-%else
- mov eax,ecx
- xor ecx, byte 0x0F
- shl ecx, 2
- movd xmmB,ecx
- psrlq xmmH,4
- pcmpeqb xmmE,xmmE
- psrlq xmmH,xmmB
- psrlq xmmE,xmmB
- punpcklbw xmmE,xmmH
- ; ----------------
- mov ecx,edi
- and ecx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- add eax,ecx
- cmp eax, byte SIZEOF_XMMWORD
- ja short .adj0
- and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmG,xmmA
- movdqa xmmC,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmD,ecx
- sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmF,ecx
- psllq xmmA,xmmF
- psllq xmmE,xmmF
- jmp short .adj0
-.adj1: neg ecx
- movd xmmF,ecx
- psrlq xmmA,xmmF
- psrlq xmmE,xmmF
- psllq xmmG,xmmD
- psllq xmmC,xmmD
- por xmmA,xmmG
- por xmmE,xmmC
-.adj0: ; ----------------
- movdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -442,7 +399,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
alignx 16,7
.column_st32:
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
@@ -459,62 +415,20 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
jb short .column_st7
movq MMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD/2
+ add edi, byte SIZEOF_XMMWORD/8*4
sub ecx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, 64
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test ecx, ecx
jz short .endcolumn
movd DWORD [edi], xmmA
-%else
- cmp ecx, byte SIZEOF_XMMWORD/16
- jb short .endcolumn
- mov eax,ecx
- xor ecx, byte 0x03
- inc ecx
- shl ecx, 4
- movd xmmF,ecx
- psrlq xmmE,xmmF
- punpcklbw xmmE,xmmE
- ; ----------------
- mov ecx,edi
- and ecx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
- cmp eax, byte SIZEOF_XMMWORD
- ja short .adj0
- and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmB,xmmA
- movdqa xmmG,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmC,ecx
- sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmH,ecx
- psllq xmmA,xmmH
- psllq xmmE,xmmH
- jmp short .adj0
-.adj1: neg ecx
- movd xmmH,ecx
- psrlq xmmA,xmmH
- psrlq xmmE,xmmH
- psllq xmmB,xmmC
- psllq xmmG,xmmC
- por xmmA,xmmB
- por xmmE,xmmG
-.adj0: ; ----------------
- movdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
diff --git a/trunk/simd/jsimdext.inc b/trunk/simd/jsimdext.inc
index 1d4d3e2..253b897 100644
--- a/trunk/simd/jsimdext.inc
+++ b/trunk/simd/jsimdext.inc
@@ -86,8 +86,6 @@ section .note.GNU-stack noalloc noexec nowrite progbits
%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
%endif
-%define STRICT_MEMORY_ACCESS 1
-
; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC