求助:如何用 SSE 指令对内存中的数据做循环移位,或者交换字节次序?请帮忙优化下面的代码。
我现在的做法效率太低,有没有高手能帮忙优化一下?
int nSize = 16000000;
LPDWORD pSrcData = new DWORD[nSize];
LPDWORD pDesData = new DWORD[nSize];
int i = 0;
for(i = 0; i < nSize; i++)
{
pSrcData[i] = 0xff807060;
pDesData[i] = 0;
}
DWORD dwTick3 = GetTickCount(); // MMX
int nSSESize = nSize / 4;
DWORD xx4[] = { 0x000000ff,0x000000ff,0x000000ff,0x000000ff };
DWORD xx5[] = { 0x00ff0000,0x00ff0000,0x00ff0000,0x00ff0000 };
DWORD xx6[] = { 0xff00ff00,0xff00ff00,0xff00ff00,0xff00ff00 };
__asm
{
mov ecx,nSSESize
mov esi,pSrcData
mov edi,pDesData
movdqu xmm4,[xx4]
movdqu xmm5,[xx5]
movdqu xmm6,[xx6]
__LOOP2:
movdqa xmm0, [esi]
movdqa xmm1, xmm0
psrld xmm1,16 // 0x0000aarr0000aarr
pand xmm1,xmm4
movdqa xmm2,xmm1
movdqa xmm1,xmm0
pslld xmm1,16 // 0xggbb0000ggbb0000
pand xmm1,xmm5
por xmm2,xmm1 // 0x00rr00bb00rr00bb
pand xmm0,xmm6 // 0XAA00GG00AA00GG00
por xmm2,xmm0 // 0xaarrggbbaarrggbb
movdqa [edi],xmm2
add esi,16
add edi,16
dec ecx
jnz __LOOP2
emms
}
dwTick3 = GetTickCount() - dwTick3;