| 网站首页 | 业界新闻 | 小组 | 威客 | 人才 | 下载频道 | 博客 | 代码贴 | 在线编程 | 编程论坛
欢迎加入我们,一同切磋技术
用户名:   
 
密 码:  
共有 4993 人关注过本帖
标题:改天研究一下 貌似GCC生成的汇编效率很渣
只看楼主 加入收藏
zklhp
Rank: 20Rank: 20Rank: 20Rank: 20Rank: 20
来 自:china
等 级:贵宾
威 望:254
帖 子:11485
专家分:33241
注 册:2007-7-10
结帖率:100%
收藏
 问题点数:0 回复次数:1 
改天研究一下 貌似GCC生成的汇编效率很渣
程序代码:
// gcc -Wall -O3 -ftracer -fivopts -ftree-loop-linear -ftree-vectorize -fforce-addr -fomit-frame-pointer -fno-bounds-check -funroll-loops -ffast-math -march=native -mfpmath=sse -mmmx -msse -msse2 -msse3 a.c -o a
#include <stdio.h>
#include <time.h>

/*
 * Micro-benchmark: runs 1000 x 1000000 iterations of a short chain of
 * int, float and double arithmetic, then prints the final double results
 * (e, f) followed by the elapsed time.
 *
 * The elapsed time is printed in raw clock() ticks, not seconds; divide by
 * CLOCKS_PER_SEC to convert.
 *
 * Only the values of e and f after the last iteration are observable, and
 * they depend solely on the final value of `a`, so an optimizing compiler
 * is free to reduce the loop body to the integer recurrence on `a`.
 *
 * Returns 0.
 */
int main(void)
{
    int i, j, a = 1, b = 1;
    float c = 1.0, d = 1.0;
    double e = 1.0, f = 1.0;
    double start, finish, duration;
    start = clock();

    for (i = 0; i < 1000; i++)
    {
        for (j = 0; j < 1000000; j++)
        {
            /*
             * The recurrence a = ((a + 50) - 100) * 20 leaves the range of
             * int after a handful of iterations.  Signed overflow is
             * undefined behaviour, so the arithmetic is done in unsigned
             * int, where it wraps modulo 2^32 by definition.  Converting
             * the result back to int is implementation-defined rather than
             * undefined; GCC and clang reduce it modulo 2^32.
             */
            a = (int)((unsigned int)a + 50u);
            b = (int)((unsigned int)a - 100u);
            a = (int)((unsigned int)b * 20u);
            /* The constants are double, so each float assignment is
             * computed in double and then rounded to float. */
            c = a + 300.89;
            d = c - 600.89;
            c = d * 90.89;
            d = c / 55.89;
            e = c * 90.89;
            f = e / 55.89;
        }
    }

    finish = clock();
    duration = finish - start;      /* elapsed clock() ticks */
    printf("%f, %f\n", e, f);
    printf("%10e\n", duration);
    return 0;
}


程序代码:
   0x0000000000402c50 <+0>:    push   rbx
   0x0000000000402c51 <+1>:    sub    rsp,0x40
   0x0000000000402c55 <+5>:    vmovaps XMMWORD PTR [rsp+0x20],xmm6
   0x0000000000402c5b <+11>:    vmovaps XMMWORD PTR [rsp+0x30],xmm7
   0x0000000000402c61 <+17>:    call   0x4016a0 <__main>
   0x0000000000402c66 <+22>:    call   0x402ae8 <clock>
   0x0000000000402c6b <+27>:    vxorpd xmm7,xmm7,xmm7
   0x0000000000402c6f <+31>:    mov    ecx,0x3e8
   0x0000000000402c74 <+36>:    mov    r8d,0x1
   0x0000000000402c7a <+42>:    vcvtsi2sd xmm7,xmm7,eax
   0x0000000000402c7e <+46>:    xchg   ax,ax
   0x0000000000402c80 <+48>:    mov    edx,0xf4240
   0x0000000000402c85 <+53>:    lea    eax,[r8+r8*4-0xfa]
   0x0000000000402c8d <+61>:    lea    ebx,[rax*4-0x32]
   0x0000000000402c94 <+68>:    lea    r8d,[rbx+rbx*4]
   0x0000000000402c98 <+72>:    lea    r9d,[r8*4-0x32]
   0x0000000000402ca0 <+80>:    lea    r10d,[r9+r9*4]
   0x0000000000402ca4 <+84>:    lea    r11d,[r10*4-0x32]
   0x0000000000402cac <+92>:    lea    eax,[r11+r11*4]
   0x0000000000402cb0 <+96>:    lea    ebx,[rax*4-0x32]
   0x0000000000402cb7 <+103>:    lea    r8d,[rbx+rbx*4]
   0x0000000000402cbb <+107>:    lea    r9d,[r8*4-0x32]
   0x0000000000402cc3 <+115>:    lea    r10d,[r9+r9*4]
   0x0000000000402cc7 <+119>:    lea    r11d,[r10*4-0x32]
   0x0000000000402ccf <+127>:    lea    eax,[r11+r11*4]
   0x0000000000402cd3 <+131>:    lea    ebx,[rax*4-0x32]
   0x0000000000402cda <+138>:    lea    r8d,[rbx+rbx*4]
   0x0000000000402cde <+142>:    shl    r8d,0x2
   0x0000000000402ce2 <+146>:    sub    edx,0x8
   0x0000000000402ce5 <+149>:    jne    0x402c85 <main+53>
   0x0000000000402ce7 <+151>:    sub    ecx,0x1
   0x0000000000402cea <+154>:    jne    0x402c80 <main+48>
   0x0000000000402cec <+156>:    vxorpd xmm0,xmm0,xmm0
   0x0000000000402cf0 <+160>:    vcvtsi2sd xmm1,xmm0,r8d
   0x0000000000402cf5 <+165>:    vaddsd xmm2,xmm1,QWORD PTR [rip+0x1313]        # 0x404010
   0x0000000000402cfd <+173>:    vmovsd xmm1,QWORD PTR [rip+0x131b]        # 0x404020
   0x0000000000402d05 <+181>:    vcvtsd2ss xmm3,xmm3,xmm2
   0x0000000000402d09 <+185>:    vcvtss2sd xmm4,xmm4,xmm3
   0x0000000000402d0d <+189>:    vsubsd xmm5,xmm4,QWORD PTR [rip+0x1303]        # 0x404018
   0x0000000000402d15 <+197>:    vcvtsd2ss xmm6,xmm6,xmm5
   0x0000000000402d19 <+201>:    vcvtss2sd xmm0,xmm0,xmm6
   0x0000000000402d1d <+205>:    vmulsd xmm2,xmm0,xmm1
   0x0000000000402d21 <+209>:    vcvtsd2ss xmm3,xmm3,xmm2
   0x0000000000402d25 <+213>:    vcvtss2sd xmm4,xmm4,xmm3
   0x0000000000402d29 <+217>:    vmulsd xmm6,xmm4,xmm1
   0x0000000000402d2d <+221>:    call   0x402ae8 <clock>
   0x0000000000402d32 <+226>:    lea    rcx,[rip+0x12c7]        # 0x404000
   0x0000000000402d39 <+233>:    vmulsd xmm5,xmm6,QWORD PTR [rip+0x12e7]        # 0x404028
   0x0000000000402d41 <+241>:    mov    ebx,eax
   0x0000000000402d43 <+243>:    vmovapd xmm1,xmm6
   0x0000000000402d47 <+247>:    vmovq  rdx,xmm6
   0x0000000000402d4c <+252>:    vmovapd xmm2,xmm5
   0x0000000000402d50 <+256>:    vmovq  r8,xmm5
   0x0000000000402d55 <+261>:    call   0x402ab0 <printf>
   0x0000000000402d5a <+266>:    vxorpd xmm0,xmm0,xmm0
   0x0000000000402d5e <+270>:    lea    rcx,[rip+0x12a3]        # 0x404008
   0x0000000000402d65 <+277>:    vcvtsi2sd xmm1,xmm0,ebx
   0x0000000000402d69 <+281>:    vsubsd xmm7,xmm1,xmm7
   0x0000000000402d6d <+285>:    vmovapd xmm1,xmm7
   0x0000000000402d71 <+289>:    vmovq  rdx,xmm7
   0x0000000000402d76 <+294>:    call   0x402ab0 <printf>
   0x0000000000402d7b <+299>:    nop
   0x0000000000402d7c <+300>:    vmovaps xmm6,XMMWORD PTR [rsp+0x20]
   0x0000000000402d82 <+306>:    xor    eax,eax
   0x0000000000402d84 <+308>:    vmovaps xmm7,XMMWORD PTR [rsp+0x30]
   0x0000000000402d8a <+314>:    add    rsp,0x40
   0x0000000000402d8e <+318>:    pop    rbx
   0x0000000000402d8f <+319>:    ret

搜索更多相关主题的帖子: include color 
2015-06-23 14:14
wfoo
Rank: 3Rank: 3
等 级:论坛游侠
威 望:7
帖 子:120
专家分:134
注 册:2011-8-6
收藏
得分:0 
gcc -O2  -msse3  -S a.c -m64

程序代码:
    .file    "a.c"
    .section    .rodata.str1.1,"aMS",@progbits,1
.LC4:
    .string    "%f, %f\n"
.LC5:
    .string    "%10e\n"
    .section    .text.unlikely,"ax",@progbits
.LCOLDB6:
    .section    .text.startup,"ax",@progbits
.LHOTB6:
    .p2align 4,,15
    .globl    main
    .type    main, @function
# int main(void) -- GCC 4.9.2 output (gcc -O2 -msse3 -m64), AT&T syntax.
#
# Register / stack roles:
#   %eax      = a, the integer recurrence value, while the loops run
#   %ecx      = outer loop countdown (1000 -> 0)
#   %edx      = inner loop countdown (1000000 -> 0)
#   %rbx      = second clock() result, kept across the first printf
#   (%rsp)    = first clock() result converted to double (start)
#   8(%rsp)   = e, kept across the second clock() call
#
# Only the integer recurrence remains inside the loops; the floating-point
# chain is evaluated once, after the loops, from the final value of a.
main:
.LFB11:
    .cfi_startproc
    # Save callee-saved %rbx; together with the 16-byte local area this
    # keeps %rsp 16-byte aligned at every call below.
    pushq    %rbx
    .cfi_def_cfa_offset 16
    .cfi_offset 3, -16
    subq    $16, %rsp
    .cfi_def_cfa_offset 32
    call    clock
    # Zero %xmm2 first so the conversion below does not depend on its old value.
    pxor    %xmm2, %xmm2
    movl    $1000, %ecx
    # start = (double)clock()
    cvtsi2sdq    %rax, %xmm2
    # a = 1
    movl    $1, %eax
    movsd    %xmm2, (%rsp)
    .p2align 4,,10
    .p2align 3
.L2:
    movl    $1000000, %edx
    .p2align 4,,10
    .p2align 3
.L3:
    # a = (5*a - 250) * 4 = 20*a - 1000, i.e. ((a + 50) - 100) * 20.
    # Both instructions work on 32-bit registers, so the value wraps
    # modulo 2^32.
    leal    -250(%rax,%rax,4), %eax
    sall    $2, %eax
    subl    $1, %edx
    jne    .L3
    subl    $1, %ecx
    jne    .L2
    # Floating-point tail. Each cvtsd2ss/cvtss2sd pair rounds the double
    # intermediate to float and widens it again, matching the assignments
    # to the float variables c and d in the C source.
    pxor    %xmm0, %xmm0
    # %xmm1 = 90.89, used by both multiplications below
    movsd    .LC2(%rip), %xmm1
    cvtsi2sd    %eax, %xmm0
    # c = a + 300.89
    addsd    .LC0(%rip), %xmm0
    cvtsd2ss    %xmm0, %xmm0
    cvtss2sd    %xmm0, %xmm0
    # d = c - 600.89
    subsd    .LC1(%rip), %xmm0
    cvtsd2ss    %xmm0, %xmm0
    cvtss2sd    %xmm0, %xmm0
    # c = d * 90.89
    mulsd    %xmm1, %xmm0
    cvtsd2ss    %xmm0, %xmm0
    cvtss2sd    %xmm0, %xmm0
    # e = c * 90.89 (stays double, no rounding to float)
    mulsd    %xmm1, %xmm0
    # Spill e: the xmm registers are not preserved across the call.
    movsd    %xmm0, 8(%rsp)
    call    clock
    movsd    8(%rsp), %xmm0
    movq    %rax, %rbx
    # printf("%f, %f\n", e, f): format in %edi, e in %xmm0, f in %xmm1,
    # %eax = 2 vector registers used by this variadic call.
    movl    $.LC4, %edi
    movl    $2, %eax
    movapd    %xmm0, %xmm1
    # f = e / 55.89
    divsd    .LC3(%rip), %xmm1
    call    printf
    # printf("%10e\n", duration) with duration = (double)finish - start
    pxor    %xmm0, %xmm0
    movl    $.LC5, %edi
    movl    $1, %eax
    cvtsi2sdq    %rbx, %xmm0
    subsd    (%rsp), %xmm0
    call    printf
    addq    $16, %rsp
    .cfi_def_cfa_offset 16
    # return 0
    xorl    %eax, %eax
    popq    %rbx
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE11:
    .size    main, .-main
    .section    .text.unlikely
.LCOLDE6:
    .section    .text.startup
.LHOTE6:
    # Double-precision constants used by main. Each one is emitted as two
    # 32-bit words, low word first; the pairs encode the same bit patterns
    # as the .quad constants in the clang listing of this program.
    .section    .rodata.cst8,"aM",@progbits,8
    .align 8
.LC0:
    # double 300.89
    .long    1889785610
    .long    1081265725
    .align 8
.LC1:
    # double 600.89
    .long    -1202590843
    .long    1082312478
    .align 8
.LC2:
    # double 90.89
    .long    -1030792151
    .long    1079425269
    .align 8
.LC3:
    # double 55.89
    .long    -2061584302
    .long    1078718955
    .ident    "GCC: (Debian 4.9.2-10) 4.9.2"
    .section    .note.GNU-stack,"",@progbits


用同样的参数clang -O2 -msse3  -S a.c -m64,循环部分感觉要好些。
不过gcc是4.9,可能用5.1的优化会好些。
程序代码:
    .text
    .file    "a.c"
    .section    .rodata.cst8,"aM",@progbits,8
    .align    8
.LCPI0_0:
    .quad    4644000929050515210     # double 300.88999999999999
.LCPI0_1:
    .quad    -4574875336699679867    # double -600.88999999999999
.LCPI0_2:
    .quad    4636096232095177769     # double 90.89
.LCPI0_3:
    .quad    4633062635533678674     # double 55.890000000000001
    .text
    .globl    main
    .align    16, 0x90
    .type    main,@function
# int main(void) -- clang 3.5.0 output (clang -O2 -msse3 -m64), AT&T syntax.
#
# Register / stack roles:
#   %ebx       = a, the integer recurrence value
#   %ebp       = outer loop counter (0 -> 1000)
#   %ecx       = inner loop countdown, stepped by 5
#   %rax       = first clock() result; not written by the loops, so it is
#                still valid when converted after them
#   16(%rsp)   = start, later overwritten with duration
#   8(%rsp)    = e
#   (%rsp)     = f
main:                                   # @main
    .cfi_startproc
# BB#0:
    # Two pushes plus 24 bytes of locals keep %rsp 16-byte aligned at calls.
    pushq    %rbp
.Ltmp0:
    .cfi_def_cfa_offset 16
    pushq    %rbx
.Ltmp1:
    .cfi_def_cfa_offset 24
    subq    $24, %rsp
.Ltmp2:
    .cfi_def_cfa_offset 48
.Ltmp3:
    .cfi_offset %rbx, -24
.Ltmp4:
    .cfi_offset %rbp, -16
    # a = 1 and i = 0 are set before the call; %rbx/%rbp are callee-saved,
    # so both survive clock().
    movl    $1, %ebx
    xorl    %ebp, %ebp
    callq    clock
    .align    16, 0x90
.LBB0_1:                                # %.preheader
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
    movl    $1000000, %ecx          # imm = 0xF4240
    .align    16, 0x90
.LBB0_2:                                #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
    # Five source iterations folded into one step: applying
    # a -> 20*a - 1000 five times gives a -> 3200000*a - 168421000
    # (20^5 = 3200000), hence the countdown by 5.
    imull     $3200000, %ebx         # imm = 0x30D400
    addl    $-168421000, %ebx       # imm = 0xFFFFFFFFF5F61978
    addl    $-5, %ecx
    jne    .LBB0_2
# BB#3:                                 #   in Loop: Header=BB0_1 Depth=1
    incl    %ebp
    cmpl    $1000, %ebp             # imm = 0x3E8
    jne    .LBB0_1
# BB#4:
    # start = (double) of the first clock() result, converted only now.
    cvtsi2sdq    %rax, %xmm0
    movsd    %xmm0, 16(%rsp)         # 8-byte Spill
    # Floating-point tail, evaluated once from the final a. Each
    # cvtsd2ss/cvtss2sd pair rounds the intermediate to float and widens it
    # again, matching the assignments to the float variables c and d.
    xorps    %xmm0, %xmm0
    cvtsi2sdl    %ebx, %xmm0
    # c = a + 300.89
    addsd    .LCPI0_0(%rip), %xmm0
    cvtsd2ss    %xmm0, %xmm0
    cvtss2sd    %xmm0, %xmm0
    # d = c - 600.89, done as an add of -600.89
    addsd    .LCPI0_1(%rip), %xmm0
    cvtsd2ss    %xmm0, %xmm0
    cvtss2sd    %xmm0, %xmm0
    # %xmm1 = 90.89; c = d * 90.89
    movsd    .LCPI0_2(%rip), %xmm1
    mulsd    %xmm1, %xmm0
    cvtsd2ss    %xmm0, %xmm0
    cvtss2sd    %xmm0, %xmm0
    # e = c * 90.89
    mulsd    %xmm1, %xmm0
    movsd    %xmm0, 8(%rsp)          # 8-byte Spill
    # f = e / 55.89
    divsd    .LCPI0_3(%rip), %xmm0
    movsd    %xmm0, (%rsp)           # 8-byte Spill
    callq    clock
    # duration = (double)finish - start, stored over the start slot
    xorps    %xmm0, %xmm0
    cvtsi2sdq    %rax, %xmm0
    subsd    16(%rsp), %xmm0         # 8-byte Folded Reload
    movsd    %xmm0, 16(%rsp)         # 8-byte Spill
    # printf("%f, %f\n", e, f): %al = 2 vector registers for the variadic call
    movl    $.L.str, %edi
    movb    $2, %al
    movsd    8(%rsp), %xmm0          # 8-byte Reload
    movsd    (%rsp), %xmm1           # 8-byte Reload
    callq    printf
    # printf("%10e\n", duration): %al = 1 vector register
    movl    $.L.str1, %edi
    movb    $1, %al
    movsd    16(%rsp), %xmm0         # 8-byte Reload
    callq    printf
    # return 0
    xorl    %eax, %eax
    addq    $24, %rsp
    popq    %rbx
    popq    %rbp
    retq
.Ltmp5:
    .size    main, .Ltmp5-main
    .cfi_endproc

    .type    .L.str,@object          # @.str
    .section    .rodata.str1.1,"aMS",@progbits,1
.L.str:
    .asciz    "%f, %f\n"
    .size    .L.str, 8

    .type    .L.str1,@object         # @.str1
.L.str1:
    .asciz    "%10e\n"
    .size    .L.str1, 6


    .ident    "Debian clang version 3.5.0-10 (tags/RELEASE_350/final) (based on LLVM 3.5.0)"
    .section    ".note.GNU-stack","",@progbits


[ 本帖最后由 wfoo 于 2015-6-23 15:35 编辑 ]
2015-06-23 15:31
快速回复:改天研究一下 貌似GCC生成的汇编效率很渣
数据加载中...
 
   



关于我们 | 广告合作 | 编程中国 | 清除Cookies | TOP | 手机版

编程中国 版权所有,并保留所有权利。
Powered by Discuz, Processed in 0.032680 second(s), 7 queries.
Copyright©2004-2024, BCCN.NET, All Rights Reserved