#2
wfoo 2015-06-23 15:31
gcc -O2 -msse3 -S a.c -m64
程序代码:
# ---------------------------------------------------------------------------
# Listing 1: GCC 4.9.2 output for a.c (AT&T syntax; see the .ident line below).
# One statement per line restored; the instruction stream itself is unchanged.
#
# Shape of main:
#   - clock() result converted to double and kept at (%rsp)
#   - nested counted loops (%ecx = 1000 outer, %edx = 1000000 inner) whose
#     only surviving work is the integer recurrence  eax = 20*eax - 1000
#   - the float/double arithmetic runs once, after the loops, on the final eax
#   - two printf calls; %eax holds the number of vector registers used by
#     each variadic call (2, then 1)
# ---------------------------------------------------------------------------
        .file   "a.c"
        .section        .rodata.str1.1,"aMS",@progbits,1
.LC4:
        .string "%f, %f\n"
.LC5:
        .string "%10e\n"
        .section        .text.unlikely,"ax",@progbits
.LCOLDB6:
        .section        .text.startup,"ax",@progbits
.LHOTB6:
        .p2align 4,,15
        .globl  main
        .type   main, @function
main:
.LFB11:
        .cfi_startproc
        pushq   %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        subq    $16, %rsp
        .cfi_def_cfa_offset 32
        call    clock
        pxor    %xmm2, %xmm2
        movl    $1000, %ecx
        cvtsi2sdq       %rax, %xmm2
        movl    $1, %eax
        # (%rsp) = first clock() value as a double
        movsd   %xmm2, (%rsp)
        .p2align 4,,10
        .p2align 3
.L2:
        movl    $1000000, %edx
        .p2align 4,,10
        .p2align 3
.L3:
        # eax = 5*eax - 250, then *4  ==>  eax = 20*eax - 1000 per iteration
        leal    -250(%rax,%rax,4), %eax
        sall    $2, %eax
        subl    $1, %edx
        jne     .L3
        subl    $1, %ecx
        jne     .L2
        # Each cvtsd2ss/cvtss2sd pair rounds the running value to single
        # precision and widens it back to double.
        pxor    %xmm0, %xmm0
        movsd   .LC2(%rip), %xmm1
        cvtsi2sd        %eax, %xmm0
        addsd   .LC0(%rip), %xmm0
        cvtsd2ss        %xmm0, %xmm0
        cvtss2sd        %xmm0, %xmm0
        subsd   .LC1(%rip), %xmm0
        cvtsd2ss        %xmm0, %xmm0
        cvtss2sd        %xmm0, %xmm0
        mulsd   %xmm1, %xmm0
        cvtsd2ss        %xmm0, %xmm0
        cvtss2sd        %xmm0, %xmm0
        mulsd   %xmm1, %xmm0
        # Spill the product across the second clock() call.
        movsd   %xmm0, 8(%rsp)
        call    clock
        movsd   8(%rsp), %xmm0
        movq    %rax, %rbx
        movl    $.LC4, %edi
        movl    $2, %eax
        movapd  %xmm0, %xmm1
        divsd   .LC3(%rip), %xmm1
        call    printf
        pxor    %xmm0, %xmm0
        movl    $.LC5, %edi
        movl    $1, %eax
        # xmm0 = second clock() value - first clock() value
        cvtsi2sdq       %rbx, %xmm0
        subsd   (%rsp), %xmm0
        call    printf
        addq    $16, %rsp
        .cfi_def_cfa_offset 16
        xorl    %eax, %eax
        popq    %rbx
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE11:
        .size   main, .-main
        .section        .text.unlikely
.LCOLDE6:
        .section        .text.startup
.LHOTE6:
        # 8-byte constants, each emitted as two 32-bit words (low word first).
        # The bit patterns equal the .quad constants in the clang listing
        # below: .LC0 = 300.89, .LC1 = 600.89, .LC2 = 90.89, .LC3 = 55.89.
        .section        .rodata.cst8,"aM",@progbits,8
        .align 8
.LC0:
        .long   1889785610
        .long   1081265725
        .align 8
.LC1:
        .long   -1202590843
        .long   1082312478
        .align 8
.LC2:
        .long   -1030792151
        .long   1079425269
        .align 8
.LC3:
        .long   -2061584302
        .long   1078718955
        .ident  "GCC: (Debian 4.9.2-10) 4.9.2"
        .section        .note.GNU-stack,"",@progbits
用同样的参数clang -O2 -msse3 -S a.c -m64,循环部分感觉要好些。 不过gcc是4.9,可能用5.1的优化会好些。
程序代码:
# ---------------------------------------------------------------------------
# Listing 2: clang 3.5.0 output for a.c (AT&T syntax; see the .ident line).
# One statement per line restored; clang's own trailing annotations are kept.
#
# Difference from listing 1 in the loop: five steps of the recurrence
# x -> 20*x - 1000 are folded into one multiply-add,
#   ebx = 3200000*ebx - 168421000      (3200000 = 20^5,
#                                       168421000 = 1000*(20^4+20^3+20^2+20+1))
# so the inner counter %ecx steps by -5.
# ---------------------------------------------------------------------------
        .text
        .file   "a.c"
        .section        .rodata.cst8,"aM",@progbits,8
        .align  8
.LCPI0_0:
        .quad   4644000929050515210     # double 300.88999999999999
.LCPI0_1:
        .quad   -4574875336699679867    # double -600.88999999999999
.LCPI0_2:
        .quad   4636096232095177769     # double 90.89
.LCPI0_3:
        .quad   4633062635533678674     # double 55.890000000000001
        .text
        .globl  main
        .align  16, 0x90
        .type   main,@function
main:                                   # @main
        .cfi_startproc
# BB#0:
        pushq   %rbp
.Ltmp0:
        .cfi_def_cfa_offset 16
        pushq   %rbx
.Ltmp1:
        .cfi_def_cfa_offset 24
        subq    $24, %rsp
.Ltmp2:
        .cfi_def_cfa_offset 48
.Ltmp3:
        .cfi_offset %rbx, -24
.Ltmp4:
        .cfi_offset %rbp, -16
        # ebx = running integer value (starts at 1), ebp = outer loop counter.
        # The loop below contains no calls and never writes rax, so the
        # clock() result is still in rax when BB#4 converts it.
        movl    $1, %ebx
        xorl    %ebp, %ebp
        callq   clock
        .align  16, 0x90
.LBB0_1:                                # %.preheader
                                        # =>This Loop Header: Depth=1
                                        # Child Loop BB0_2 Depth 2
        movl    $1000000, %ecx          # imm = 0xF4240
        .align  16, 0x90
.LBB0_2:                                # Parent Loop BB0_1 Depth=1
                                        # => This Inner Loop Header: Depth=2
        imull   $3200000, %ebx          # imm = 0x30D400
        addl    $-168421000, %ebx       # imm = 0xFFFFFFFFF5F61978
        addl    $-5, %ecx
        jne     .LBB0_2
# BB#3:                                 # in Loop: Header=BB0_1 Depth=1
        incl    %ebp
        cmpl    $1000, %ebp             # imm = 0x3E8
        jne     .LBB0_1
# BB#4:
        cvtsi2sdq       %rax, %xmm0
        movsd   %xmm0, 16(%rsp)         # 8-byte Spill
        xorps   %xmm0, %xmm0
        cvtsi2sdl       %ebx, %xmm0
        addsd   .LCPI0_0(%rip), %xmm0
        cvtsd2ss        %xmm0, %xmm0
        cvtss2sd        %xmm0, %xmm0
        addsd   .LCPI0_1(%rip), %xmm0
        cvtsd2ss        %xmm0, %xmm0
        cvtss2sd        %xmm0, %xmm0
        movsd   .LCPI0_2(%rip), %xmm1
        mulsd   %xmm1, %xmm0
        cvtsd2ss        %xmm0, %xmm0
        cvtss2sd        %xmm0, %xmm0
        mulsd   %xmm1, %xmm0
        movsd   %xmm0, 8(%rsp)          # 8-byte Spill
        divsd   .LCPI0_3(%rip), %xmm0
        movsd   %xmm0, (%rsp)           # 8-byte Spill
        callq   clock
        xorps   %xmm0, %xmm0
        cvtsi2sdq       %rax, %xmm0
        subsd   16(%rsp), %xmm0         # 8-byte Folded Reload
        movsd   %xmm0, 16(%rsp)         # 8-byte Spill
        movl    $.L.str, %edi
        movb    $2, %al
        movsd   8(%rsp), %xmm0          # 8-byte Reload
        movsd   (%rsp), %xmm1           # 8-byte Reload
        callq   printf
        movl    $.L.str1, %edi
        movb    $1, %al
        movsd   16(%rsp), %xmm0         # 8-byte Reload
        callq   printf
        xorl    %eax, %eax
        addq    $24, %rsp
        popq    %rbx
        popq    %rbp
        retq
.Ltmp5:
        .size   main, .Ltmp5-main
        .cfi_endproc
        .type   .L.str,@object          # @.str
        .section        .rodata.str1.1,"aMS",@progbits,1
.L.str:
        .asciz  "%f, %f\n"
        .size   .L.str, 8
        .type   .L.str1,@object         # @.str1
.L.str1:
        .asciz  "%10e\n"
        .size   .L.str1, 6
        .ident  "Debian clang version 3.5.0-10 (tags/RELEASE_350/final) (based on LLVM 3.5.0)"
        .section        ".note.GNU-stack","",@progbits
[ 本帖最后由 wfoo 于 2015-6-23 15:35 编辑 ] |
程序代码:
// gcc -Wall -O3 -ftracer -fivopts -ftree-loop-linear -ftree-vectorize -fforce-addr -fomit-frame-pointer -fno-bounds-check -funroll-loops -ffast-math -march=native -mfpmath=sse -mmmx -msse -msse2 -msse3 a.c -o a
#include <stdio.h>
#include <time.h>
/*
 * Micro-benchmark: runs a short integer recurrence followed by mixed
 * float/double arithmetic 1000 * 1000000 times and reports
 *   - the final values of e and f, and
 *   - the elapsed processor time as the raw difference of two clock()
 *     readings (clock ticks, NOT divided by CLOCKS_PER_SEC).
 *
 * Returns 0.
 */
int main(void)
{
    int i, j;
    /*
     * The recurrence a = (a - 50) * 20 exceeds 32 bits after a handful of
     * iterations.  Overflow of a signed int is undefined behaviour, so the
     * recurrence is carried out on unsigned values, whose arithmetic is
     * defined to wrap modulo 2^N.
     */
    unsigned int a = 1, b = 1;
    float c = 1.0, d = 1.0;
    double e = 1.0, f = 1.0;
    double start, finish, duration;

    start = clock();
    for (i = 0; i < 1000; i++)
    {
        for (j = 0; j < 1000000; j++)
        {
            a = a + 50u;
            b = a - 100u;
            a = b * 20u;
            /*
             * Reinterpret the wrapped value as a signed int before the
             * floating-point step.  Converting an out-of-range unsigned
             * value to int is implementation-defined (GCC and Clang reduce
             * it modulo 2^N), never undefined.
             */
            c = (int)a + 300.89;
            d = c - 600.89;
            c = d * 90.89;
            d = c / 55.89;
            e = c * 90.89;
            f = e / 55.89;
        }
    }
    finish = clock();

    duration = finish - start;          /* in clock() ticks */
    printf("%f, %f\n", e, f);
    printf("%10e\n", duration);
    return 0;
}
程序代码:
# Debugger disassembly of main (Intel syntax, AVX-encoded scalar instructions).
# NOTE(review): presumably the build of the C benchmark listed above, using the
# gcc command in its leading comment — confirm.
# Register usage (first argument in rcx, xmm6/xmm7 saved and restored by this
# function) is consistent with the Microsoft x64 calling convention.
#
# Prologue: save rbx, reserve 0x40 bytes, and save xmm6/xmm7, which this
# function uses to keep values alive across calls.
0x0000000000402c50 <+0>: push rbx
0x0000000000402c51 <+1>: sub rsp,0x40
0x0000000000402c55 <+5>: vmovaps XMMWORD PTR [rsp+0x20],xmm6
0x0000000000402c5b <+11>: vmovaps XMMWORD PTR [rsp+0x30],xmm7
# NOTE(review): __main is presumably the toolchain's runtime start-up hook
# (MinGW-style) — confirm.
0x0000000000402c61 <+17>: call 0x4016a0 <__main>
# xmm7 = (double) first clock() result; ecx = 0x3e8 (1000) outer counter;
# r8d = 1 is the running integer value.
0x0000000000402c66 <+22>: call 0x402ae8 <clock>
0x0000000000402c6b <+27>: vxorpd xmm7,xmm7,xmm7
0x0000000000402c6f <+31>: mov ecx,0x3e8
0x0000000000402c74 <+36>: mov r8d,0x1
0x0000000000402c7a <+42>: vcvtsi2sd xmm7,xmm7,eax
# Two-byte no-op; the next instruction (outer loop head) lands on the
# 16-byte-aligned address 0x402c80.
0x0000000000402c7e <+46>: xchg ax,ax
# Outer loop head: edx = 0xf4240 (1000000) inner counter.
0x0000000000402c80 <+48>: mov edx,0xf4240
# Inner loop body: eight steps of x -> 20*x - 1000 per pass (edx drops by 8).
# The lea instructions alternate between two forms:
#   [r+r*4-0xfa]  = 5*x - 250          (first half of a step)
#   [r*4-0x32]    = (20*x - 1000) - 50 (finishes the step and pre-subtracts
#                                       50 for the next one)
#   [r+r*4]       = 5*(x' - 50) = 5*x' - 250
# The eighth step ends with a plain shift (*4) so r8d holds the exact value.
0x0000000000402c85 <+53>: lea eax,[r8+r8*4-0xfa]
0x0000000000402c8d <+61>: lea ebx,[rax*4-0x32]
0x0000000000402c94 <+68>: lea r8d,[rbx+rbx*4]
0x0000000000402c98 <+72>: lea r9d,[r8*4-0x32]
0x0000000000402ca0 <+80>: lea r10d,[r9+r9*4]
0x0000000000402ca4 <+84>: lea r11d,[r10*4-0x32]
0x0000000000402cac <+92>: lea eax,[r11+r11*4]
0x0000000000402cb0 <+96>: lea ebx,[rax*4-0x32]
0x0000000000402cb7 <+103>: lea r8d,[rbx+rbx*4]
0x0000000000402cbb <+107>: lea r9d,[r8*4-0x32]
0x0000000000402cc3 <+115>: lea r10d,[r9+r9*4]
0x0000000000402cc7 <+119>: lea r11d,[r10*4-0x32]
0x0000000000402ccf <+127>: lea eax,[r11+r11*4]
0x0000000000402cd3 <+131>: lea ebx,[rax*4-0x32]
0x0000000000402cda <+138>: lea r8d,[rbx+rbx*4]
0x0000000000402cde <+142>: shl r8d,0x2
0x0000000000402ce2 <+146>: sub edx,0x8
0x0000000000402ce5 <+149>: jne 0x402c85 <main+53>
0x0000000000402ce7 <+151>: sub ecx,0x1
0x0000000000402cea <+154>: jne 0x402c80 <main+48>
# After both loops: the floating-point chain runs once on the final r8d.
# Each vcvtsd2ss/vcvtss2sd pair rounds to single precision and widens back.
# NOTE(review): the constants at 0x404010 / 0x404018 / 0x404020 are not shown
# in this listing; presumably 300.89 / 600.89 / 90.89 from the C source.
0x0000000000402cec <+156>: vxorpd xmm0,xmm0,xmm0
0x0000000000402cf0 <+160>: vcvtsi2sd xmm1,xmm0,r8d
0x0000000000402cf5 <+165>: vaddsd xmm2,xmm1,QWORD PTR [rip+0x1313] # 0x404010
0x0000000000402cfd <+173>: vmovsd xmm1,QWORD PTR [rip+0x131b] # 0x404020
0x0000000000402d05 <+181>: vcvtsd2ss xmm3,xmm3,xmm2
0x0000000000402d09 <+185>: vcvtss2sd xmm4,xmm4,xmm3
0x0000000000402d0d <+189>: vsubsd xmm5,xmm4,QWORD PTR [rip+0x1303] # 0x404018
0x0000000000402d15 <+197>: vcvtsd2ss xmm6,xmm6,xmm5
0x0000000000402d19 <+201>: vcvtss2sd xmm0,xmm0,xmm6
0x0000000000402d1d <+205>: vmulsd xmm2,xmm0,xmm1
0x0000000000402d21 <+209>: vcvtsd2ss xmm3,xmm3,xmm2
0x0000000000402d25 <+213>: vcvtss2sd xmm4,xmm4,xmm3
# xmm6 = final product; it is read again after the clock() call below.
0x0000000000402d29 <+217>: vmulsd xmm6,xmm4,xmm1
0x0000000000402d2d <+221>: call 0x402ae8 <clock>
# First printf: rcx = address 0x404000 (first argument).
# NOTE(review): a multiply by the constant at 0x404028 stands where the C
# source divides by 55.89; presumably a reciprocal substituted under
# -ffast-math — confirm.
0x0000000000402d32 <+226>: lea rcx,[rip+0x12c7] # 0x404000
0x0000000000402d39 <+233>: vmulsd xmm5,xmm6,QWORD PTR [rip+0x12e7] # 0x404028
# ebx = second clock() result, kept across the printf call.
0x0000000000402d41 <+241>: mov ebx,eax
# Each floating-point argument is placed both in an xmm register and in the
# integer register of the same argument slot (xmm1/rdx, xmm2/r8);
# presumably required because printf is variadic — confirm against the ABI.
0x0000000000402d43 <+243>: vmovapd xmm1,xmm6
0x0000000000402d47 <+247>: vmovq rdx,xmm6
0x0000000000402d4c <+252>: vmovapd xmm2,xmm5
0x0000000000402d50 <+256>: vmovq r8,xmm5
0x0000000000402d55 <+261>: call 0x402ab0 <printf>
# Second printf: xmm7 = (double) second clock value - first clock value,
# passed in xmm1 and rdx; rcx = address 0x404008.
0x0000000000402d5a <+266>: vxorpd xmm0,xmm0,xmm0
0x0000000000402d5e <+270>: lea rcx,[rip+0x12a3] # 0x404008
0x0000000000402d65 <+277>: vcvtsi2sd xmm1,xmm0,ebx
0x0000000000402d69 <+281>: vsubsd xmm7,xmm1,xmm7
0x0000000000402d6d <+285>: vmovapd xmm1,xmm7
0x0000000000402d71 <+289>: vmovq rdx,xmm7
0x0000000000402d76 <+294>: call 0x402ab0 <printf>
# Epilogue: restore xmm6/xmm7 and rbx, release the frame, return 0 in eax.
0x0000000000402d7b <+299>: nop
0x0000000000402d7c <+300>: vmovaps xmm6,XMMWORD PTR [rsp+0x20]
0x0000000000402d82 <+306>: xor eax,eax
0x0000000000402d84 <+308>: vmovaps xmm7,XMMWORD PTR [rsp+0x30]
0x0000000000402d8a <+314>: add rsp,0x40
0x0000000000402d8e <+318>: pop rbx
0x0000000000402d8f <+319>: ret