about summary refs log tree commit diff
path: root/simd
diff options
context:
space:
mode:
author dcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db> 2012-06-13 01:23:09 +0000
committer dcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db> 2012-06-13 01:23:09 +0000
commit 96b5364379693f942462bd9d7710bf004412b249 (patch)
tree a1f532ad8171743038deb5e58ec222732558fd57 /simd
parent afcf82954f01493fe1554469a0a236e01c595d22 (diff)
Eliminate the use of the MASKMOVDQU instruction, which speeds up decompression by 10x on AMD Bobcat embedded processors (and by ~5% on AMD desktop processors).
git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@836 632fc199-4ca6-4c93-a231-07263d6284db
Diffstat (limited to 'simd')
-rw-r--r-- simd/jdclrss2-64.asm 49
-rw-r--r-- simd/jdclrss2.asm 49
-rw-r--r-- simd/jdmrgss2-64.asm 50
-rw-r--r-- simd/jdmrgss2.asm 49
4 files changed, 76 insertions, 121 deletions
diff --git a/simd/jdclrss2-64.asm b/simd/jdclrss2-64.asm
index 696a383..06cb213 100644
--- a/simd/jdclrss2-64.asm
+++ b/simd/jdclrss2-64.asm
@@ -251,17 +251,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow
@@ -275,17 +271,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
@@ -363,7 +358,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+ movdqu XMMWORD [rdi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -409,19 +404,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow
@@ -434,17 +424,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub rcx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
@@ -503,7 +492,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+ movdqu XMMWORD [rdi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
diff --git a/simd/jdclrss2.asm b/simd/jdclrss2.asm
index 7f519e6..1354c3d 100644
--- a/simd/jdclrss2.asm
+++ b/simd/jdclrss2.asm
@@ -262,17 +262,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
@@ -287,17 +283,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
@@ -375,7 +370,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -421,19 +416,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
@@ -447,17 +437,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
@@ -516,7 +505,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
diff --git a/simd/jdmrgss2-64.asm b/simd/jdmrgss2-64.asm
index a64a6b3..ffe0288 100644
--- a/simd/jdmrgss2-64.asm
+++ b/simd/jdmrgss2-64.asm
@@ -252,17 +252,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn
@@ -275,21 +271,19 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
jmp near .columnloop
.column_st32:
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
@@ -367,7 +361,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [rdi],xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -413,19 +407,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn
@@ -441,17 +430,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub rcx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
@@ -510,7 +498,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [rdi],xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm
index 04089aa..556a490 100644
--- a/simd/jdmrgss2.asm
+++ b/simd/jdmrgss2.asm
@@ -264,17 +264,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
@@ -292,17 +288,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
@@ -380,7 +375,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -426,19 +421,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
@@ -455,17 +445,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
@@ -524,7 +513,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------