改天研究一下，貌似 GCC 生成的汇编效率很渣。
程序代码:
// gcc -Wall -O3 -ftracer -fivopts -ftree-loop-linear -ftree-vectorize -fforce-addr -fomit-frame-pointer -fno-bounds-check -funroll-loops -ffast-math -march=native -mfpmath=sse -mmmx -msse -msse2 -msse3 a.c -o a
#include <limits.h>
#include <stdio.h>
#include <time.h>

/* Trip counts of the outer and inner benchmark loops. */
enum { OUTER_ITERATIONS = 1000, INNER_ITERATIONS = 1000000 };

/*
 * Map an unsigned value onto the int with the same two's-complement bit
 * pattern without relying on an out-of-range conversion (which would be
 * implementation-defined).
 */
static int wrap_to_int(unsigned int u)
{
    if (u <= (unsigned int)INT_MAX) {
        return (int)u;
    }
    /* UINT_MAX - u is in [0, INT_MAX] here, so the negation cannot overflow. */
    return -(int)(UINT_MAX - u) - 1;
}

/*
 * Arithmetic micro-benchmark.
 *
 * Runs a chain of integer, float and double operations
 * OUTER_ITERATIONS * INNER_ITERATIONS times, then prints the final values of
 * e and f followed by the elapsed CPU time in seconds as measured by clock().
 *
 * Returns 0.
 */
int main(void)
{
    /*
     * The integer recurrence multiplies by 20 every iteration, so it leaves
     * the range of int within the first few iterations.  Signed overflow is
     * undefined behaviour; unsigned arithmetic wraps with defined semantics.
     */
    unsigned int a = 1;
    unsigned int b = 1;
    float c = 1.0, d = 1.0;
    double e = 1.0, f = 1.0;

    clock_t start = clock();

    for (int i = 0; i < OUTER_ITERATIONS; i++) {
        for (int j = 0; j < INNER_ITERATIONS; j++) {
            a = a + 50u;
            b = a - 100u;
            a = b * 20u;

            /* The constants are double, so each step is evaluated in double
             * and then narrowed back to float on assignment to c or d. */
            c = wrap_to_int(a) + 300.89;
            d = c - 600.89;
            c = d * 90.89;
            d = c / 55.89;

            e = c * 90.89;
            f = e / 55.89;
        }
    }

    clock_t finish = clock();

    /* clock() counts implementation-specific ticks; convert to seconds. */
    double duration = (double)(finish - start) / CLOCKS_PER_SEC;

    printf("%f, %f\n", e, f);
    printf("%10e\n", duration);
    return 0;
}
反汇编代码（GCC 生成的 main 函数）:
0x0000000000402c50 <+0>: push rbx 0x0000000000402c51 <+1>: sub rsp,0x40 0x0000000000402c55 <+5>: vmovaps XMMWORD PTR [rsp+0x20],xmm6 0x0000000000402c5b <+11>: vmovaps XMMWORD PTR [rsp+0x30],xmm7 0x0000000000402c61 <+17>: call 0x4016a0 <__main> 0x0000000000402c66 <+22>: call 0x402ae8 <clock> 0x0000000000402c6b <+27>: vxorpd xmm7,xmm7,xmm7 0x0000000000402c6f <+31>: mov ecx,0x3e8 0x0000000000402c74 <+36>: mov r8d,0x1 0x0000000000402c7a <+42>: vcvtsi2sd xmm7,xmm7,eax 0x0000000000402c7e <+46>: xchg ax,ax 0x0000000000402c80 <+48>: mov edx,0xf4240 0x0000000000402c85 <+53>: lea eax,[r8+r8*4-0xfa] 0x0000000000402c8d <+61>: lea ebx,[rax*4-0x32] 0x0000000000402c94 <+68>: lea r8d,[rbx+rbx*4] 0x0000000000402c98 <+72>: lea r9d,[r8*4-0x32] 0x0000000000402ca0 <+80>: lea r10d,[r9+r9*4] 0x0000000000402ca4 <+84>: lea r11d,[r10*4-0x32] 0x0000000000402cac <+92>: lea eax,[r11+r11*4] 0x0000000000402cb0 <+96>: lea ebx,[rax*4-0x32] 0x0000000000402cb7 <+103>: lea r8d,[rbx+rbx*4] 0x0000000000402cbb <+107>: lea r9d,[r8*4-0x32] 0x0000000000402cc3 <+115>: lea r10d,[r9+r9*4] 0x0000000000402cc7 <+119>: lea r11d,[r10*4-0x32] 0x0000000000402ccf <+127>: lea eax,[r11+r11*4] 0x0000000000402cd3 <+131>: lea ebx,[rax*4-0x32] 0x0000000000402cda <+138>: lea r8d,[rbx+rbx*4] 0x0000000000402cde <+142>: shl r8d,0x2 0x0000000000402ce2 <+146>: sub edx,0x8 0x0000000000402ce5 <+149>: jne 0x402c85 <main+53> 0x0000000000402ce7 <+151>: sub ecx,0x1 0x0000000000402cea <+154>: jne 0x402c80 <main+48> 0x0000000000402cec <+156>: vxorpd xmm0,xmm0,xmm0 0x0000000000402cf0 <+160>: vcvtsi2sd xmm1,xmm0,r8d 0x0000000000402cf5 <+165>: vaddsd xmm2,xmm1,QWORD PTR [rip+0x1313] # 0x404010 0x0000000000402cfd <+173>: vmovsd xmm1,QWORD PTR [rip+0x131b] # 0x404020 0x0000000000402d05 <+181>: vcvtsd2ss xmm3,xmm3,xmm2 0x0000000000402d09 <+185>: vcvtss2sd xmm4,xmm4,xmm3 0x0000000000402d0d <+189>: vsubsd xmm5,xmm4,QWORD PTR [rip+0x1303] # 0x404018 0x0000000000402d15 <+197>: vcvtsd2ss xmm6,xmm6,xmm5 0x0000000000402d19 <+201>: vcvtss2sd 
xmm0,xmm0,xmm6 0x0000000000402d1d <+205>: vmulsd xmm2,xmm0,xmm1 0x0000000000402d21 <+209>: vcvtsd2ss xmm3,xmm3,xmm2 0x0000000000402d25 <+213>: vcvtss2sd xmm4,xmm4,xmm3 0x0000000000402d29 <+217>: vmulsd xmm6,xmm4,xmm1 0x0000000000402d2d <+221>: call 0x402ae8 <clock> 0x0000000000402d32 <+226>: lea rcx,[rip+0x12c7] # 0x404000 0x0000000000402d39 <+233>: vmulsd xmm5,xmm6,QWORD PTR [rip+0x12e7] # 0x404028 0x0000000000402d41 <+241>: mov ebx,eax 0x0000000000402d43 <+243>: vmovapd xmm1,xmm6 0x0000000000402d47 <+247>: vmovq rdx,xmm6 0x0000000000402d4c <+252>: vmovapd xmm2,xmm5 0x0000000000402d50 <+256>: vmovq r8,xmm5 0x0000000000402d55 <+261>: call 0x402ab0 <printf> 0x0000000000402d5a <+266>: vxorpd xmm0,xmm0,xmm0 0x0000000000402d5e <+270>: lea rcx,[rip+0x12a3] # 0x404008 0x0000000000402d65 <+277>: vcvtsi2sd xmm1,xmm0,ebx 0x0000000000402d69 <+281>: vsubsd xmm7,xmm1,xmm7 0x0000000000402d6d <+285>: vmovapd xmm1,xmm7 0x0000000000402d71 <+289>: vmovq rdx,xmm7 0x0000000000402d76 <+294>: call 0x402ab0 <printf> 0x0000000000402d7b <+299>: nop 0x0000000000402d7c <+300>: vmovaps xmm6,XMMWORD PTR [rsp+0x20] 0x0000000000402d82 <+306>: xor eax,eax 0x0000000000402d84 <+308>: vmovaps xmm7,XMMWORD PTR [rsp+0x30] 0x0000000000402d8a <+314>: add rsp,0x40 0x0000000000402d8e <+318>: pop rbx 0x0000000000402d8f <+319>: ret