关于memset(P, 0, NUM*sizeof(int))和P[NUM] = {0}的效率问题

const int NUM = 1024;

1.
int P[NUM];
memset(P, 0, NUM*sizeof(int));

2.
int P[NUM] = {0};

上述两种操作哪个速度快？

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

http://my.csdn.net/my/album/detail/1333388
的确得看汇编
如果你要初始化的空间较小（只有几个字节）的话，后者效率高。
但是你这个例子要初始化 4*1024 字节这么大的空间，两者效率其实是一样的，后者最终还是调用了 memset。
刚刚看了 VC6 生成的汇编：
9:            memset(P, 0, NUM*sizeof(int));
004010BE   push        1000h
004010C3   push        0
004010C5   lea         eax,[ebp-1008h]
004010CB   push        eax
004010CC   call        memset (00401210)
004010D1   add         esp,0Ch
10:           int Q[NUM] = {0};
004010D4   mov         dword ptr [ebp-2008h],0
004010DE   mov         ecx,3FFh
004010E3   xor         eax,eax
004010E5   lea         edi,[ebp-2004h]
004010EB   rep stos    dword ptr [edi]
11:       }
************************************************
其中 call memset 调用的 memset 实现如下：
        public  memset

;-----------------------------------------------------------------------
; memset - fill a block of memory with a single byte value.
;
; Syntax: MASM, 32-bit x86.
;
; Stack arguments on entry (as read by the code below):
;   [esp + 4]   = dst    pointer to the start of the block
;   [esp + 8]   = value  only the low byte is used
;   [esp + 0ch] = count  number of bytes to fill
;
; Returns:   eax = dst
; Preserves: edi (pushed on entry to the fill path, popped at finish)
; Clobbers:  ecx, edx, flags
;
; Returns with a plain "ret", so the three arguments are left on the
; stack for the caller to remove.
;
; NOTE(review): "rep stosd" advances edi upward only when the direction
; flag is clear; this routine does not clear it itself - presumably the
; caller guarantees DF = 0. Confirm against the calling convention.
;-----------------------------------------------------------------------
memset proc

        .FPO    ( 0, 3, 0, 0, 0, 0 )

        mov     edx,[esp + 0ch] ; edx = "count"
        mov     ecx,[esp + 4]   ; ecx points to "dst"

        test    edx,edx         ; 0?
        jz      short toend     ; if so, nothing to do

        xor     eax,eax
        mov     al,[esp + 8]    ; the byte "value" to be stored

; Align address on dword boundary

        push    edi             ; preserve edi
        mov     edi,ecx         ; edi = dest pointer

        cmp     edx,4           ; if it's less than 4 bytes
        jb      tail            ; tail needs edi and edx to be initialized

        neg     ecx
        and     ecx,3           ; ecx = # bytes before dword boundary
        jz      short dwords    ; jump if address already aligned

        sub     edx,ecx         ; edx = adjusted count (for later)
adjust_loop:
        mov     [edi],al        ; byte-fill up to the dword boundary
        inc     edi
        dec     ecx
        jnz     adjust_loop

dwords:
; set all 4 bytes of eax to [value]
        mov     ecx,eax         ; ecx=0/0/0/value
        shl     eax,8           ; eax=0/0/value/0

        add     eax,ecx         ; eax=0/0/val/val

        mov     ecx,eax         ; ecx=0/0/val/val

        shl     eax,10h         ; eax=val/val/0/0

        add     eax,ecx         ; eax = all 4 bytes = [value]

; Set dword-sized blocks
        mov     ecx,edx         ; move remaining count to ecx
        and     edx,3           ; prepare in edx byte count (for tail loop)
        shr     ecx,2           ; adjust ecx to be dword count
        jz      tail            ; jump if it was less than 4 bytes
                                ; (remaining count was 1..3, so edx != 0)

        rep     stosd
main_loop_tail:
        test    edx,edx         ; if there are no tail bytes,
        jz      finish          ; we finish, and it's time to leave

; Set remaining bytes
tail:
        mov     [edi],al        ; set remaining bytes
        inc     edi

        dec     edx             ; if there are some more bytes
        jnz     tail            ; continue to fill them

; Done
finish:
        mov     eax,[esp + 8]   ; return dest pointer (offset +4 for saved edi)
        pop     edi             ; restore edi

        ret

toend:
        mov     eax,[esp + 4]   ; return dest pointer (edi was never pushed)

        ret

memset  endp

        end
后者直接从.data区copy，前者是run time循环运算，速度没办法比
说错了
前者主要耗时指令rep stosd
后者主要耗时指令rep stos
可能速度还是后者略高，但在数据量大的情况下差别可以忽略；memset 里很多指令是在做对齐工作。
效率最高的写法是：
int P[NUM] = {};
使用memset的话，编译器没有多少优化的余地，但初始化列表编译器就很容易做优化了。
最快的方法是设置成为  局部static，如果你了解局部静态的优势的话
static int P[NUM] ;前提是最快
我认为，调用memset也有额外的压栈指令和CALL指令，不仅增加了程序的大小，也增加了运行时间，而数组初始化是直接嵌入汇编的，所以效率当然高了。
请问局部静态变量不是存放在stack区的，会不会占用额外的全局内存空间？
我发现这样做的结果还是编译器调用memset函数。
我写了以下代码：

#include <stdio.h>

void foo()
{
    int p[100] = {};
    printf("%d", p[0]);
    _asm int 3
}

void main()
{
    foo();
}

使用VC2008编译器编译成Release版本后的反汇编代码如下：
CPU Disasm
Address   Hex dump          Command                               Comments
0040104B  |.  8D4424 04     lea eax,[esp+4]                       ; |
0040104F  |.  6A 00         push 0                                ; |Arg2 = 0
00401051  |.  50            push eax                              ; |Arg1 => offset LOCAL.99
00401052  |.  E8 1B000000   call <jmp.&MSVCR90.memset>            ; \MSVCR90.memset
的确如此，但数组小的话就不会换成memset的了。直接使用memset的话不管数组大小都会有一次函数调用的开销。