summary | refs | log | tree | commit | diff
path: root/trunk
diff options
context:
space:
mode:
author: dcommander <dcommander@3789f03b-4d11-0410-bbf8-ca57d06f2519> 2012-06-13 01:23:09 +0000
committer: dcommander <dcommander@3789f03b-4d11-0410-bbf8-ca57d06f2519> 2012-06-13 01:23:09 +0000
commit: 5861deaf27254e3b18f748765f41d83f50648711 (patch)
tree: ae60dc0048529bfadeb14b5bf9a1520df482b95b /trunk
parent: a1598e8bfae5d9eb5721a015d36399489ae465f6 (diff)
Eliminate the use of the MASKMOVDQU instruction, which speeds up decompression by 10x on AMD Bobcat embedded processors (and by ~5% on AMD desktop processors).
git-svn-id: https://libjpeg-turbo.svn.sourceforge.net/svnroot/libjpeg-turbo@836 3789f03b-4d11-0410-bbf8-ca57d06f2519
Diffstat (limited to 'trunk')
-rw-r--r-- trunk/ChangeLog.txt 7
-rw-r--r-- trunk/simd/jdclrss2-64.asm 49
-rw-r--r-- trunk/simd/jdclrss2.asm 49
-rw-r--r-- trunk/simd/jdmrgss2-64.asm 50
-rw-r--r-- trunk/simd/jdmrgss2.asm 49
5 files changed, 83 insertions, 121 deletions
diff --git a/trunk/ChangeLog.txt b/trunk/ChangeLog.txt
index d4808e8..5453869 100644
--- a/trunk/ChangeLog.txt
+++ b/trunk/ChangeLog.txt
@@ -37,6 +37,13 @@ to a large value) would cause libjpeg-turbo to segfault.
[8] Extended the TurboJPEG Java API so that it can be used to decompress a
JPEG image into an arbitrary position in a large output buffer.
+[9] Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
+processors. The MASKMOVDQU instruction, which was used by the libjpeg-turbo
+SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and
+it is painfully slow on Bobcat processors in particular. Eliminating the use
+of this instruction improved performance by an order of magnitude on Bobcat
+processors and by a small amount (typically 5%) on AMD desktop processors.
+
1.2.0
=====
diff --git a/trunk/simd/jdclrss2-64.asm b/trunk/simd/jdclrss2-64.asm
index 696a383..06cb213 100644
--- a/trunk/simd/jdclrss2-64.asm
+++ b/trunk/simd/jdclrss2-64.asm
@@ -251,17 +251,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow
@@ -275,17 +271,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
@@ -363,7 +358,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+ movdqu XMMWORD [rdi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -409,19 +404,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow
@@ -434,17 +424,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub rcx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
@@ -503,7 +492,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+ movdqu XMMWORD [rdi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
diff --git a/trunk/simd/jdclrss2.asm b/trunk/simd/jdclrss2.asm
index 7f519e6..1354c3d 100644
--- a/trunk/simd/jdclrss2.asm
+++ b/trunk/simd/jdclrss2.asm
@@ -262,17 +262,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
@@ -287,17 +283,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
@@ -375,7 +370,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -421,19 +416,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
@@ -447,17 +437,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
@@ -516,7 +505,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
diff --git a/trunk/simd/jdmrgss2-64.asm b/trunk/simd/jdmrgss2-64.asm
index a64a6b3..ffe0288 100644
--- a/trunk/simd/jdmrgss2-64.asm
+++ b/trunk/simd/jdmrgss2-64.asm
@@ -252,17 +252,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn
@@ -275,21 +271,19 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
jmp near .columnloop
.column_st32:
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
@@ -367,7 +361,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [rdi],xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -413,19 +407,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn
@@ -441,17 +430,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub rcx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
@@ -510,7 +498,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [rdi],xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
diff --git a/trunk/simd/jdmrgss2.asm b/trunk/simd/jdmrgss2.asm
index 04089aa..556a490 100644
--- a/trunk/simd/jdmrgss2.asm
+++ b/trunk/simd/jdmrgss2.asm
@@ -264,17 +264,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
@@ -292,17 +288,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
@@ -380,7 +375,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -426,19 +421,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
@@ -455,17 +445,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
@@ -524,7 +513,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------