diff options
author | dcommander <dcommander@3789f03b-4d11-0410-bbf8-ca57d06f2519> | 2012-06-13 01:23:09 +0000 |
---|---|---|
committer | dcommander <dcommander@3789f03b-4d11-0410-bbf8-ca57d06f2519> | 2012-06-13 01:23:09 +0000 |
commit | 5861deaf27254e3b18f748765f41d83f50648711 (patch) | |
tree | ae60dc0048529bfadeb14b5bf9a1520df482b95b /trunk | |
parent | a1598e8bfae5d9eb5721a015d36399489ae465f6 (diff) |
Eliminate the use of the MASKMOVDQU instruction to speed up decompression by 10x on AMD Bobcat embedded processors (and by ~5% on AMD desktop processors).
git-svn-id: https://libjpeg-turbo.svn.sourceforge.net/svnroot/libjpeg-turbo@836 3789f03b-4d11-0410-bbf8-ca57d06f2519
Diffstat (limited to 'trunk')
-rw-r--r-- | trunk/ChangeLog.txt | 7 | ||||
-rw-r--r-- | trunk/simd/jdclrss2-64.asm | 49 | ||||
-rw-r--r-- | trunk/simd/jdclrss2.asm | 49 | ||||
-rw-r--r-- | trunk/simd/jdmrgss2-64.asm | 50 | ||||
-rw-r--r-- | trunk/simd/jdmrgss2.asm | 49 |
5 files changed, 83 insertions, 121 deletions
diff --git a/trunk/ChangeLog.txt b/trunk/ChangeLog.txt index d4808e8..5453869 100644 --- a/trunk/ChangeLog.txt +++ b/trunk/ChangeLog.txt @@ -37,6 +37,13 @@ to a large value) would cause libjpeg-turbo to segfault. [8] Extended the TurboJPEG Java API so that it can be used to decompress a JPEG image into an arbitrary position in a large output buffer. +[9] Worked around a severe performance issue with "Bobcat" (AMD Embedded APU) +processors. The MASKMOVDQU instruction, which was used by the libjpeg-turbo +SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and +it is painfully slow on Bobcat processors in particular. Eliminating the use +of this instruction improved performance by an order of magnitude on Bobcat +processors and by a small amount (typically 5%) on AMD desktop processors. + 1.2.0 ===== diff --git a/trunk/simd/jdclrss2-64.asm b/trunk/simd/jdclrss2-64.asm index 696a383..06cb213 100644 --- a/trunk/simd/jdclrss2-64.asm +++ b/trunk/simd/jdclrss2-64.asm @@ -251,17 +251,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2): movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmH,xmmH ; xmmH=(all 1's) - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF .out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub rcx, byte SIZEOF_XMMWORD jz near .nextrow @@ -275,17 +271,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2): lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE cmp rcx, byte 2*SIZEOF_XMMWORD jb 
short .column_st16 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmF sub rcx, byte 2*SIZEOF_XMMWORD jmp short .column_st15 .column_st16: cmp rcx, byte SIZEOF_XMMWORD jb short .column_st15 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA add rdi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub rcx, byte SIZEOF_XMMWORD @@ -363,7 +358,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): por xmmA,xmmG por xmmE,xmmC .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA + movdqu XMMWORD [rdi], xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %else ; RGB_PIXELSIZE == 4 ; ----------- @@ -409,19 +404,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2): movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmE,xmmE ; xmmE=(all 1's) - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH .out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub rcx, byte SIZEOF_XMMWORD jz near .nextrow @@ -434,17 +424,16 @@ 
EXTN(jsimd_ycc_rgb_convert_sse2): pcmpeqb xmmE,xmmE ; xmmE=(all 1's) cmp rcx, byte SIZEOF_XMMWORD/2 jb short .column_st16 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmC movdqa xmmD,xmmH sub rcx, byte SIZEOF_XMMWORD/2 .column_st16: cmp rcx, byte SIZEOF_XMMWORD/4 jb short .column_st15 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA add rdi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub rcx, byte SIZEOF_XMMWORD/4 @@ -503,7 +492,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): por xmmA,xmmB por xmmE,xmmG .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA + movdqu XMMWORD [rdi], xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; RGB_PIXELSIZE ; --------------- diff --git a/trunk/simd/jdclrss2.asm b/trunk/simd/jdclrss2.asm index 7f519e6..1354c3d 100644 --- a/trunk/simd/jdclrss2.asm +++ b/trunk/simd/jdclrss2.asm @@ -262,17 +262,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2): movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmH,xmmH ; xmmH=(all 1's) - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF .out0: + add edi, byte 
RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub ecx, byte SIZEOF_XMMWORD jz near .nextrow @@ -287,17 +283,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2): lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE cmp ecx, byte 2*SIZEOF_XMMWORD jb short .column_st16 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmF sub ecx, byte 2*SIZEOF_XMMWORD jmp short .column_st15 .column_st16: cmp ecx, byte SIZEOF_XMMWORD jb short .column_st15 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA add edi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub ecx, byte SIZEOF_XMMWORD @@ -375,7 +370,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): por xmmA,xmmG por xmmE,xmmC .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi], xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %else ; RGB_PIXELSIZE == 4 ; ----------- @@ -421,19 +416,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2): movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmE,xmmE ; xmmE=(all 1's) - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD 
[edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH .out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub ecx, byte SIZEOF_XMMWORD jz near .nextrow @@ -447,17 +437,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2): pcmpeqb xmmE,xmmE ; xmmE=(all 1's) cmp ecx, byte SIZEOF_XMMWORD/2 jb short .column_st16 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmC movdqa xmmD,xmmH sub ecx, byte SIZEOF_XMMWORD/2 .column_st16: cmp ecx, byte SIZEOF_XMMWORD/4 jb short .column_st15 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA add edi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub ecx, byte SIZEOF_XMMWORD/4 @@ -516,7 +505,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): por xmmA,xmmB por xmmE,xmmG .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi], xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; RGB_PIXELSIZE ; --------------- diff --git a/trunk/simd/jdmrgss2-64.asm b/trunk/simd/jdmrgss2-64.asm index a64a6b3..ffe0288 100644 --- a/trunk/simd/jdmrgss2-64.asm +++ b/trunk/simd/jdmrgss2-64.asm @@ -252,17 +252,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmH,xmmH ; xmmH=(all 1's) - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF - 
add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF .out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub rcx, byte SIZEOF_XMMWORD jz near .endcolumn @@ -275,21 +271,19 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): jmp near .columnloop .column_st32: - pcmpeqb xmmH,xmmH ; xmmH=(all 1's) lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE cmp rcx, byte 2*SIZEOF_XMMWORD jb short .column_st16 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmF sub rcx, byte 2*SIZEOF_XMMWORD jmp short .column_st15 .column_st16: cmp rcx, byte SIZEOF_XMMWORD jb short .column_st15 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA add rdi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub rcx, byte SIZEOF_XMMWORD @@ -367,7 +361,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): por xmmA,xmmG por xmmE,xmmC .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [rdi],xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %else ; RGB_PIXELSIZE == 4 ; ----------- @@ -413,19 +407,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmE,xmmE ; xmmE=(all 1's) - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmC,xmmE ; 
movntdqu XMMWORD [rdi], xmmC - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH .out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub rcx, byte SIZEOF_XMMWORD jz near .endcolumn @@ -441,17 +430,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): pcmpeqb xmmE,xmmE ; xmmE=(all 1's) cmp rcx, byte SIZEOF_XMMWORD/2 jb short .column_st16 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmC movdqa xmmD,xmmH sub rcx, byte SIZEOF_XMMWORD/2 .column_st16: cmp rcx, byte SIZEOF_XMMWORD/4 jb short .column_st15 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA add rdi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub rcx, byte SIZEOF_XMMWORD/4 @@ -510,7 +498,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): por xmmA,xmmB por xmmE,xmmG .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [rdi],xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; RGB_PIXELSIZE ; --------------- diff --git a/trunk/simd/jdmrgss2.asm b/trunk/simd/jdmrgss2.asm index 04089aa..556a490 100644 --- a/trunk/simd/jdmrgss2.asm +++ b/trunk/simd/jdmrgss2.asm @@ -264,17 +264,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb 
xmmH,xmmH ; xmmH=(all 1's) - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF .out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub ecx, byte SIZEOF_XMMWORD jz near .endcolumn @@ -292,17 +288,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE cmp ecx, byte 2*SIZEOF_XMMWORD jb short .column_st16 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmF sub ecx, byte 2*SIZEOF_XMMWORD jmp short .column_st15 .column_st16: cmp ecx, byte SIZEOF_XMMWORD jb short .column_st15 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA add edi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub ecx, byte SIZEOF_XMMWORD @@ -380,7 +375,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): por xmmA,xmmG por xmmE,xmmC .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi], xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %else ; RGB_PIXELSIZE == 4 ; ----------- @@ -426,19 +421,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmE,xmmE ; xmmE=(all 1's) - maskmovdqu 
xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH .out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub ecx, byte SIZEOF_XMMWORD jz near .endcolumn @@ -455,17 +445,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): pcmpeqb xmmE,xmmE ; xmmE=(all 1's) cmp ecx, byte SIZEOF_XMMWORD/2 jb short .column_st16 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmC movdqa xmmD,xmmH sub ecx, byte SIZEOF_XMMWORD/2 .column_st16: cmp ecx, byte SIZEOF_XMMWORD/4 jb short .column_st15 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA add edi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub ecx, byte SIZEOF_XMMWORD/4 @@ -524,7 +513,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): por xmmA,xmmB por xmmE,xmmG .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi], xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; RGB_PIXELSIZE ; --------------- |