拆数程序的后续

下面是 alphaX的代码主要部分
    for I := 1 to $00FFFFFF do
    begin
      temp0 := precarr[0][tbits(I).bytearray[0]];
      temp1 := precarr[1][tbits(I).bytearray[1]];
      temp2 := precarr[2][tbits(I).bytearray[2]];      temp0 := temp0 + temp1 + temp2;  // (3)      if abs(temp0 - sum) < 100 then   // (4)
      begin
        Result := I;
        Exit;
      end;
    end;其中(4)的地方 if abs(temp0 - sum) < 100 then  完全可以改成 temp0 - sum = 0 then
因为是正数运算了，那个误差没必要考虑了，计算前浮点属化整数的过程我已经处理了精度。
按照这个修改我的代码应该    t:= aTemp[0][tByteArray(i).Bytes[0]] + aTemp[1][tByteArray(i).Bytes[1]] +
        aTemp[2][tByteArray(i).Bytes[2]] + aTemp[3][tByteArray(i).Bytes[3]]；
    if t=sum then begin因为这个是循环的主体，所以调试的时候查看了汇编了代码，发现了一个问题(编译优化已经打开)看循环体部分的汇编代码00459569  |. 76 5B          JBE SHORT TakeApar.004595C6
0045956B  |. C745 FC 010000>MOV DWORD PTR SS:[EBP-4],1    t:= aTemp[0][tByteArray(i).Bytes[0]] + aTemp[1][tByteArray(i).Bytes[1]] +
        aTemp[2][tByteArray(i).Bytes[2]] + aTemp[3][tByteArray(i).Bytes[3]];
00459572  |> 0FB645 FC      /MOVZX EAX,BYTE PTR SS:[EBP-4]
00459576  |. 8BB485 F8EFFFF>|MOV ESI,DWORD PTR SS:[EBP+EAX*4-1008]
0045957D  |. 0FB645 FD      |MOVZX EAX,BYTE PTR SS:[EBP-3]
00459581  |. 03B485 F8F3FFF>|ADD ESI,DWORD PTR SS:[EBP+EAX*4-C08]
00459588  |. 0FB645 FE      |MOVZX EAX,BYTE PTR SS:[EBP-2]
0045958C  |. 03B485 F8F7FFF>|ADD ESI,DWORD PTR SS:[EBP+EAX*4-808]
00459593  |. 0FB645 FF      |MOVZX EAX,BYTE PTR SS:[EBP-1]
00459597  |. 03B485 F8FBFFF>|ADD ESI,DWORD PTR SS:[EBP+EAX*4-408]    if sum=t then begin
0045959E  |. 8B45 F8        |MOV EAX,DWORD PTR SS:[EBP-8]
004595A1  |. 8B80 5C040000  |MOV EAX,DWORD PTR DS:[EAX+45C]
004595A7  |. 99             |CDQ
004595A8  |. 52             |PUSH EDX
004595A9  |. 50             |PUSH EAX
004595AA  |. 8BC6           |MOV EAX,ESI
004595AC  |. 33D2           |XOR EDX,EDX
004595AE  |. 3B5424 04      |CMP EDX,DWORD PTR SS:[ESP+4]
004595B2  |. 75 03          |JNZ SHORT TakeApar.004595B7
004595B4  |. 3B0424         |CMP EAX,DWORD PTR SS:[ESP]
004595B7  |> 5A             |POP EDX
004595B8  |. 58             |POP EAX
004595B9  |. 75 05          |JNZ SHORT TakeApar.004595C0      Result:= i;
004595BB  |. 8B5D FC        |MOV EBX,DWORD PTR SS:[EBP-4]      break;
004595BE  |. EB 06          |JMP SHORT TakeApar.004595C6会发现这个  if sum=t then begin 居然翻出这么一大堆东西，天知道编译器(Delphi2005)怎么优化的于是乎换了个思路改成
    t:= aTemp[0][tByteArray(i).Bytes[0]] + aTemp[1][tByteArray(i).Bytes[1]] +
        aTemp[2][tByteArray(i).Bytes[2]] + aTemp[3][tByteArray(i).Bytes[3]] - sum;
    if t=0 then begin
      Result:= i;
      break;
    end;这次出来的代码是我想要的了
    t:= aTemp[0][tByteArray(i).Bytes[0]] + aTemp[1][tByteArray(i).Bytes[1]] +
        aTemp[2][tByteArray(i).Bytes[2]] + aTemp[3][tByteArray(i).Bytes[3]] - sum;
00459570  |> 0FB64D FC      /MOVZX ECX,BYTE PTR SS:[EBP-4]
00459574  |. 8B8C8D F8EFFFF>|MOV ECX,DWORD PTR SS:[EBP+ECX*4-1008]
0045957B  |. 0FB65D FD      |MOVZX EBX,BYTE PTR SS:[EBP-3]
0045957F  |. 038C9D F8F3FFF>|ADD ECX,DWORD PTR SS:[EBP+EBX*4-C08]
00459586  |. 0FB65D FE      |MOVZX EBX,BYTE PTR SS:[EBP-2]
0045958A  |. 038C9D F8F7FFF>|ADD ECX,DWORD PTR SS:[EBP+EBX*4-808]
00459591  |. 0FB65D FF      |MOVZX EBX,BYTE PTR SS:[EBP-1]
00459595  |. 038C9D F8FBFFF>|ADD ECX,DWORD PTR SS:[EBP+EBX*4-408]
0045959C  |. 8B5D F8        |MOV EBX,DWORD PTR SS:[EBP-8]
0045959F  |. 2B8B 5C040000  |SUB ECX,DWORD PTR DS:[EBX+45C]    if t=0 then begin  // 这个才看着正常本来就是一个比较嘛
004595A5  |. 85C9           |TEST ECX,ECX
004595A7  |. 75 05          |JNZ SHORT TakeApar.004595AE      Result:= i;
004595A9  |. 8B55 FC        |MOV EDX,DWORD PTR SS:[EBP-4]      break;
004595AC  |. EB 06          |JMP SHORT TakeApar.004595B4改了之后，32个数，无解的情况下 Pentium M 1.6 从96秒降到了 67秒。不知道是不是还有优化的办法。
alphaX，在你的机器看一看是不是也这样？另外，Delphi自带的CPU View怎么段拷贝？我这是用的OllyDbg反汇编的。

>>把预计算从8bit变成16bit能快1/5，参看汇编代码之后对此表示一点怀疑。如果预计算变成16bit的话，那么取操作数的次数少了2次，加法就减少了2个，变成for i := $FFFFFFFF to 0 do
begin
temp0 := precarr[0][tbits(I).bytearray[0]];
temp1 := precarr[1][tbits(I).bytearray[1]];...同时，在循环体内的可用寄存器又增加了，编译器优化的自由度就提高了，
不过，如果预计算变成16bit，bytearray就增大了，一级高速缓存的命中率会低于8bit预计算的情况
>>Delphi自带的CPU View怎么段拷贝
用鼠标指向需要复制的第一行，按下鼠标托拽到结束行，松开，右击得到弹出菜单，点Copy

>>Delphi自带的CPU View怎么段拷贝
用鼠标指向需要复制的第一行，按下鼠标托拽到结束行，松开，右击得到弹出菜单，点Copyd2006是这样，d2005不知道，如果不行可以装一个expert（vcl原码刨析的作者做了一个，书在家里，忘了作者名了）
拆数代码我又作了优化，代码见下，主要是作标量替换和loop unrollingP4 1.8G, 32个数无解的情况下, 6秒以内返回(D2006编译)
如果--"计算前浮点属化整数的过程我已经处理了精度"，应该还可以再快代码全部加了//，以保留格式//program Project14;
//
//{$APPTYPE CONSOLE}
//
//{.$define num23}
//{.$define pentium3}
//
//uses
//  SysUtils, Windows, Math;
//
//
//  //return the selection, 0 for not found
//  function p4_2(
//          const sum, mistake: Integer;
//          const nums: array of integer;
//          const noresult: boolean): Integer;
//  label 1;
//  type
//    tprec8 = array[byte] of integer;
//    pprec8 = ^tprec8;
//    tbits = record
//      bytearray: array[0..3] of byte;
//    end;
//  var
//    count, bitmask: integer;
//    i, j, k, l, m: Integer;
//    si, sj, sk, temp, sum_min, sum_max: Integer;
//    precarr: array[0..3] of tprec8;
//    {$IFDEF pentium3}
//    prec8_ptr: pprec8;
//    {$ENDIF}
//    prec8_1_ptr: pprec8;
//
//    pint: pinteger;
//
//    procedure precalc();
//    var
//      bits: array[0..7] of byte;
//      b: byte;
//      i, j, temp: Integer;
//      pIntArr: PIntegerArray;
//    begin
//      for j := 0 to 7 do
//        bits[j] := 1 shl j;
//
//      pIntArr := @nums[0];
//      for i := 0 to 3 do
//      begin
//        for b := low(b) to high(b) do
//        begin
//          temp := 0;
//          for j := 0 to 7 do
//          begin
//            if (b and bits[j]) > 0 then
//              inc(temp, pIntArr[j]);
//          end;
//
//          precarr[i][b] := temp;
//        end;
//
//        inc(PInteger(pIntArr), 8);
//      end;
//    end;
//
//    procedure checknums();
//    var
//      i: integer;
//      sum: Int64;
//    begin
//      sum := 0;
//      for i := low(nums) to high(nums) do
//        sum := sum + nums[i];
//
//      if sum > MaxInt then
//        raise Exception.Create('invalid paramter');
//    end;
//
//  begin
//    count := Length(nums);
//    Assert((count > 0) and (count <= 32));
//    Assert(sum > 0);
//
//    checknums();
//    precalc();
//
//
//
//    if count = 32 then
//      bitmask := not 0
//    else if count = 31 then
//      bitmask := not $80000000
//    else bitmask := 1 shl count - 1;
//
//    sum_min := sum - mistake;
//    sum_max := sum + mistake;
//
//    prec8_1_ptr := @precarr[1];
//
//    for i := tbits(bitmask).bytearray[3] downto 0 do
//    begin
//      si := precarr[3][i];
//      for j := tbits(bitmask).bytearray[2] downto 0 do
//      begin
//        sj := si + precarr[2][j];
//        for k := tbits(bitmask).bytearray[1] downto 0 do
//        begin
//          sk := sj + precarr[1][k];
//          {$IFDEF pentium3}
//          prec8_ptr := @precarr[0];
//          for L := $FF downto 0 do
//          begin
//            temp := sk + prec8_ptr[L];
//            if (temp > sum_min) and (temp < sum_max) then
//              goto 1;
//          end;
//          asm
//            prefetcht0 [offset prec8_1_ptr[k-1]]
//          end;
//          {$ELSE}
//          pint := @precarr[0][$ff];
//          for L := $1f downto 0 do
//          begin
//            temp := sk + pint^;
//            if (temp > sum_min) and (temp < sum_max) then
//              goto 1;
//            dec(pint);
//
//            temp := sk + pint^;
//            if (temp > sum_min) and (temp < sum_max) then
//              goto 1;
//            dec(pint);
//
//            temp := sk + pint^;
//            if (temp > sum_min) and (temp < sum_max) then
//              goto 1;
//            dec(pint);
//
//            temp := sk + pint^;
//            if (temp > sum_min) and (temp < sum_max) then
//              goto 1;
//            dec(pint);
//
//            temp := sk + pint^;
//            if (temp > sum_min) and (temp < sum_max) then
//              goto 1;
//            dec(pint);
//
//            temp := sk + pint^;
//            if (temp > sum_min) and (temp < sum_max) then
//              goto 1;
//            dec(pint);
//
//            temp := sk + pint^;
//            if (temp > sum_min) and (temp < sum_max) then
//              goto 1;
//            dec(pint);
//
//            temp := sk + pint^;
//            if (temp > sum_min) and (temp < sum_max) then
//              goto 1;
//            dec(pint);
//          end;
//          {$ENDIF}
//        end;
//      end;
//    end;
//
//    Result := 0;
//    Exit;
//
//  1:
//    {$IFDEF pentium3}
//    //the L is predicately, because we reference it in the inner loop body
//    m := L;
//    {$ELSE}
//    //because the L value is inpredicatable in the inner loop(due to the optimization),
//    //so we must calculate it iva the value of pint
//    m := $ff - (integer(pointer(@precarr[0][$ff])) - integer(pint)) div sizeof(integer);
//    {$ENDIF}
//
//    //i,j,k is predicately
//    result := (i shl 24) or (j shl 16) or (k shl 8) or m;
//  end;

//
//  procedure print_result(sum, mistake, r: integer; const nums: array of integer);
//  var
//    i, len, count, sum_verify: integer;
//  begin
//    len := Length(nums);
//    count := 0;
//    sum_verify := 0;
//    for i := 0 to len - 1 do
//      if (r and (1 shl i)) > 0 then
//      begin
//        Writeln('nums ', i:2, ': ', nums[i]:12);
//        Inc(sum_verify, nums[i]);
//        inc(count);
//      end;
//
//    Writeln('selection count: ', count);
//    if abs(sum_verify - sum) >= mistake then
//    begin
//      WriteLn('*****************************');
//      Writeln('ERROR');
//      WriteLn('*****************************');
//    end;
//  end;
//
//
//  type
//    tdata = record
//      sum: integer;
//      mistake: integer;
//      nums: array of integer;
//    end;
//
//
//  procedure make_data(const noresult: boolean; out data: tdata);
//  {$IFNDEF num23}
//  var
//    i: integer;
//  {$ENDIF}
//  begin
//    with data do
//    begin
//      sum := 743460984;
//      mistake := 100;
//
//      {$IFDEF num23}
//      SetLength(nums, 23);
//      {$ELSE}
//      SetLength(nums, 32);
//      {$ENDIF}
//
//      nums[0] := 78539724;
//      nums[1]  := 11989508;
//      nums[2]  := 16924851;
//      if noresult then
//        nums[3]  := 49903571
//      else nums[3]  := 49903471;   //
//      nums[4]  := 11989508;
//      nums[5]  := 13619134;
//      nums[6]  := 6792615;
//      nums[7]  := 159109415;  //
//      nums[8]  := 5457709;    //
//      nums[9]  := 10030376;
//      nums[10]  := 9932054;   //
//      nums[11]  := 20060777;  //
//      nums[12]  := 12329881;  //
//      nums[13]  := 23907053;  //
//      nums[14]  := 20318391;  //
//      nums[15]  := 20318391;  //
//      nums[16]  := 55625016;  //
//      nums[17]  := 64549082;  //
//      nums[18]  := 37358893;  //
//      nums[19]  := 39924860;  //
//      nums[20]  := 38673204;  //
//      nums[21]  := 45567426;  //
//      nums[22]  := 140425361; //
//
//      {$IFNDEF num23}
//      Randomize();
//      for I := 23 to 31 do
//        nums[23] := RandomRange(10000000, 20000000);
//      {$ENDIF}
//    end;
//  end;
//
//
//  procedure test(noresult: boolean);
//  var
//    data: tdata;
//    r: integer;
//    li_freq, li_begin, li_end: TLargeInteger;
//    msecs: double;
//  begin
//    if noresult then
//      Writeln('case: noresult = true')
//    else Writeln('case: noresult = false');
//    make_data(noresult, data);
//    if not QueryPerformanceFrequency(li_freq) then
//      RaiseLastOSError();
//    QueryPerformanceCounter(li_begin);
//    r := p4_2(data.sum, data.mistake, data.nums, noresult);
//    QueryPerformanceCounter(li_end);
//
//    msecs := (li_end - li_begin) * 1000 / li_freq;
//    Writeln('use time(milli seconds): ', FormatFloat('0.00', msecs));
//    Writeln('selection=', inttohex(r, 8));
//    if r <> 0 then
//      print_result(data.sum, data.mistake, r, data.nums);
//    Writeln('');
//
//    if noresult and (r <> 0) then
//      test(true);
//  end;
//
//  procedure pause();
//  var
//    s: string;
//  begin
//    Writeln('press ENTER to continue...');
//    Readln(s);
//  end;
//
//
//begin
//  test(true);
//  test(false);
//  pause();
//end.

你这段
if sum=t then begin|. 8B45 F8 |MOV EAX,DWORD PTR SS:[EBP-8]
|. 8B80 5C040000 |MOV EAX,DWORD PTR DS:[EAX+45C]   //取sum
|. 99 |CDQ  //带符号将sum扩展成双字
|. 52 |PUSH EDX
|. 50 |PUSH EAX //将双字推到stack|. 8BC6 |MOV EAX,ESI //扩展t, 低位在eax
|. 33D2 |XOR EDX,EDX //高位在edx, 为0|. 3B5424 04 |CMP EDX,DWORD PTR SS:[ESP+4] //比较先前push的edx
|. 75 03 |JNZ SHORT TakeApar.004595B7
|. 3B0424 |CMP EAX,DWORD PTR SS:[ESP]      //比较先前push的eax
|> 5A |POP EDX  //平衡stack
|. 58 |POP EAX  //平衡stack
|. 75 05 |JNZ SHORT TakeApar.004595C0      //如果同则....看起来应该是一个integer和cardinal，进行比较，很可能你的t是cardinal,而sum是integer,在这种情况下,编译是正确的后来,你先减sum然后在测试是否为0,代码就简化了,因为加减法实质上是无符号的,而测试0就一条指令

跑哪儿过年去了？
我再看看，谢谢。D2006的确能拷贝CPU VIEW了。D2005以前的还是不行。

上面的代码有点问题，主要是没有考虑到length(nums) < 32的情况
改成:
    procedure precalc();
    var
      bits: array[0..7] of byte;
      b: byte;
      i, j, temp: Integer;
      pIntArr: PIntegerArray;
      temp_nums: array[0..31] of Integer;
    begin
      for j := 0 to 7 do
        bits[j] := 1 shl j;      fillchar(temp_nums[0], sizeof(temp_nums), 0);      for i := 0 to count-1 do
        temp_nums[i] := nums[i];      pIntArr := @temp_nums[0];
      for i := 0 to 3 do
      begin
        for b := low(b) to high(b) do
        begin
          temp := 0;
          for j := 0 to 7 do
          begin
            if (b and bits[j]) > 0 then
              inc(temp, pIntArr[j]);
          end;          precarr[i][b] := temp;
        end;        inc(PInteger(pIntArr), 8);
      end;
    end;
应该可以了另外,我刚才下班后又测试了另外一个思路,应该可以思路是:
把最内层的预计算排序一下
然后从预计算的结果中进行二分查找

    //最内层的预计算结果
    t_prec_result = class
      value: integer; //计算的值
      bitmask: byte; //L的值
    end;    t_scan_table = class
    public
      sum: integer;
      list: tlist;

      procedure sortASC(); //排序      function Scan(const aBaseValue: integer): t_prec_result; //二分查找
    end;

precarr: array[0..3] of tprec8;
    变成
precarr: array[0..2] of tprec8;


预计算变成

var
  temp_nums: array[0..31] of integer;

      pIntArr := @temp_nums[8];      //8..31，放在precarr[0..2]
      for i := 0 to 2 do
      begin
        for b := low(b) to high(b) do
        begin
          temp := 0;
          for j := 0 to 7 do
          begin
            if (b and bits[j]) <> 0 then
              inc(temp, pIntArr[j]);
          end;          precarr[i][b] := temp;
        end;        inc(PInteger(pIntArr), 8);
      end;      //0..7, 放在scan_table内
      for b := low(b) to high(b) do
      begin
        temp := 0;
        for j := 0 to 7 do
        begin
          if (b and bits[j]) <> 0 then
            inc(temp, temp_nums[j]);
        end;        pr := t_prec_result.Create();
        pr.value := temp;
        pr.bitmask := b;
        scan_table.list.Add(pr);
      end;

      scan_table.sortASC(); //按照pr.value进行升序排序

主要循环就变成
      for i := tbits(bitmask).bytearray[3] downto 0 do
      begin
        si := precarr[2][i];
        for j := tbits(bitmask).bytearray[2] downto 0 do
        begin
          sj := si + precarr[1][j];
          for k := tbits(bitmask).bytearray[1] downto 0 do
          begin
            sk := sj + precarr[0][k];

            prec_result := scan_table.scan(sk);
            if prec_result <> nil then
            begin
              result := prec_result.bitmask or (i shl 24) or (j shl 16) or (k shl 8);
              exit;
            end;
          end;
        end;
     end;Scan的代码,    function t_scan_table.Scan(const aBaseValue: integer): t_prec_result;
    var
      i, min_index, max_index, temp: integer;
      {$ifdef debug_and_trace}
      next_index: integer;
      {$endif}
      ptrlist: PPointerList;
    begin
      max_index := list.count-1;
      min_index := 0;      ptrlist := list.List;      i := max_index shr 1;      while true do
      begin
        Result := t_prec_result(ptrlist[i]);
        {$ifdef debug_and_trace}
        _index_[i] := true;
        inc(_count_);
        {$endif}        temp := (Result.value + aBaseValue);        if (temp = sum) then
          Exit                        //ok
        else if temp > sum then
        begin
          if i = 0 then
            Break;                    //exit          max_index := i-1;          {$ifdef debug_and_trace}
          next_index := (min_index + max_index) shr 1;
          Assert(next_index < i);
          i := next_index;
          {$else}
          i := (min_index + max_index) shr 1;
          {$endif}
          if min_index > i then
            Break;                    //exit
        end
        else //if (temp < sum) then
        begin
          if i = max_index then
            Break;                    //exit          min_index := i + 1;          {$ifdef debug_and_trace}
          next_index := (min_index + max_index+1) shr 1;
          Assert(next_index > i);
          i := next_index;
          {$else}
          i := (min_index + max_index + 1) shr 1;
          {$endif}
        end;
      end;      Result := nil;
    end;            这样，比原来多了个排序开销,
但原来最内层的256次操作，通过二分查找法降低到最多8次，即使算上二分查找和调用的开销，
也应该低于原来的256次操作

我在这台P4 1.8G上测得32个数,无解, 1.3秒返回

本来想把最内两层的65536次操作,都通过二分查找法来作的,这样可以把操作从65536次降低到最多16次,
就远远快于原来的操作了,但是考虑到预先的排序过程确是比较慢的,65536个数排序最坏时复杂度
是$FFFFFFFF,这就得不偿失了

十分感谢。
和我的相反有一些地方类似。
我也在考虑把拆数的大循环改称几个嵌套的循环，来减少4个数做加法消耗的时间，
我的想法：
1.上一个帖子有人提出过排序，然后计算出来至少和至多需要多少个数m和n。
2. 采用你的预计算方式，计算的同时统计与计算结果中每个结果中bit位1的个数；
3. 在循环的适当深度计算bit位上1的个数，如果大于n，或者即使下层循环全是1，1的个数也小于m，则往下循环的工作不用作了，直接Continue.这个可以理解为是剪枝的工作。
我没试过，不知道他增加开销和剪枝的效果是否值。不论如何，32个数无解的情况能降低到1.x秒，已经完全满足实际应用了。alphaX你看看这个想法可行不？

pazee大哥，你说得我有点不好意思了，哈哈上面提到的问题很难笼统说，依赖于循环的层数和最内层的开销，我对我当前的实现(我目前实现的是40个数，以下说的都是40位的情形)大致估算了一下:假设最内层的开销为300时钟(pentium3, 65536个结果中折二查找, p4为270左右)，
位计数和检查的开销为5时钟，并假设位计数和检查放在第3层(针对n >= 11 and n <= 13的情形，
对于n = 8的情况可以放在第2层，不过道理是一样的),即k循环层，
for i ...
for j ...
  ...
  bit_count_j := bit_count_map[i] + bit_count_map[j];
for k ...
begin
  ...
  bit_count_k := bit_count_j + bit_count_map[k];
  if bit_count_k > n then
    continue;  scan(65536个结果中折二查找)
  ...
end //k
...增加的开销 > ($100 0000 * 5 = $500 0000)当n=12时，节省的开销 C(12, 24) * 300 = ($29 431C) * 300 = $305A A4D0
当n=11 or 13时, 节省 $2CA2 70C0原来的整体开销 ($100 0000 * 300) 加上排序后约为 $1 7000 0000
改进后为 $1 7000 0000 + $500 0000 - $3000 0000 = $1 4500 0000也就是，如果位计数和检查的开销为5的估计准的话，n=12时时间开销可能只能
降低到原来的88%，也就是1.13倍的提高，
而成本是必须为n>=11 and n<=13的情形维护另外一个程序版本。
但位计数和检查的开销为5的估计是非常乐观的，
k层内增加的操作有：访存一次,加法一次,减法一次,测试一次,这些操作
都假设寄存器不是很紧张，而且访存命中，但实际上在外3层循环里,
每层里都有计算,实际上很难有这么好的情形,
并且一直以来delphi的对循环体的寄存器分配是比较弱的，
增加位计数累加和检查,会明显增加delphi分配寄存器的压力，
上次你也看到了，就是你前面说的delphi怎么优化的那一段，
他为了腾出寄存器，竟然在循环的最内层push/pop，其实在外层腾出寄存器，
最内层就不必push/pop了，他却不知道他所产生的push/pop是G数量级的,
还有一个问题就是增加的判断：
if bit_count > n then
...
当bit_count > n成立时，就等于pentium的指令预取判断错误,执行就停顿,
必须重新取指,所以平均增加的开销很可能不是5，可能达到8甚至更多，
所以，在外层只有3层而最内层操作开销不是很大的情况下，
加插这个位数检查暂时可能不会有太大的益处，
如果外层有4层(48个数的情形)，那么在第3层做这个处理是非常有必要的另外，我又改进了一下，原来考虑排序的算法是quick sort，
因为要确保程序在比较确定的时间内返回，所以没敢作16位排序，只做了8位的，
后来想到，可以在预计算时想办法确保结果值有序，然后用归并排序归并，
归并主体的复杂度约为 8n，即$8 0000次比较,这样就可以把最内2层，用二分查找来做了，我的最新测试，32位数无解情况下，p4 1.8，所花的时间0.02秒多一点
现已通过32位的正确性测试，现在作一下40位的测试并整理一下代码

抱歉，上面的计算错了，算漏了bit_count > n的情形
简单的，撇开预计算和排序部分的开销，单就循环主体的开销来计算
主循环主体循环次数 = $100 0000如果在第三层进行处理, n  = 11 时命中的次数(即bit_count > n)为 = C(12,24) + C(13,24) + ... + C(24,24)
不命中的次数为 = C(11,24) + C(10, 24) + ... + C(1,24)
               = C(13,24) + C(14,24) + .. + C(24,24)命中-不命中=C(12,24)
c(12,24) = $29 431C需要执行的(不命中的)= ($100 0000 - C(12,24)) / 2 = $6B 5E72也就是循环主体的循环次数下降到原来的42% ($6b 5e72 / $100 0000)
第二层，n = 7 时命中的次数为 = C(8,16) + C(9,16) + ... + C(16,16)
不命中次数为 = C(7,16) + C(6,16) + ... + C(1,16)
             = C(9,16) + ....
命中-不命中=C(8,16)
C(8,16) = 12870
不命中=($1 0000 - 12870) / 2 = 26333
循环次数下降到原来的40%
依此看来, 针对这些特定情形作带位计算处理的版本，还是能够得到2倍的性能提高的
我想了一下,>>3. 在循环的适当深度计算bit位上1的个数，如果大于n，
>>或者即使下层循环全是1，1的个数也小于m，
>>则往下循环的工作不用作了for i ...
for j ...
...
sum_i_j := prec_arr[2][i] + prec_arr[1][j];
for k ...
begin
...
sum_i_j_k := sum_i_j + prec_arr[0][k];对于n的判断，
实际上直接比较sum和sum_i_j_k就等价于判断i/j/k的位数数否大于n了，即
if sum_i_j_k > sum then
  continue;而对于m的判断，可以预先计算nums[0..15]的和，得到sum_0_15,
将sum减去sum_0_15得到一个sum2然后同样在k层
if sum_i_j_k < sum2 then
  continue;这样也等价于判断m，这样其实位计数和检查就简化了

有点怪，昨晚我改了一下，
现在40个数无解情况下竟然只要0.2秒不到，而32个数，24个数都要0.015，有点不解，不知道是否代码错了，但是测试又没发现问题
project文件
============================================
program Project15;{$APPTYPE CONSOLE}uses
  SysUtils,
  pack5 in 'pack5.pas';var
  e: tobject;
begin
  try
    t_pack_tester.full_test();
  except
    e := ExceptObject();
    if e is Exception then
      writeln('ERROR: ', exception(e).Message)
    else writeln('ERROR: (Unknown error)');
  end;
end.

unit pack5;{.$define debug}
{.$define trace}
{$define pack5_test}{$ifdef pack5_test}
  {$define perf_meas} //正式使用时，应关闭perf_meas
{$endif}
interface
uses
  SysUtils, Classes, Math
  {$ifdef pack5_test }
  , Windows
  {$endif}
  ;const
  max_num_count = 40;
  bits_per_byte = 8;
  bits: array[0..bits_per_byte-1] of cardinal = (
          1 shl 0,
          1 shl 1,
          1 shl 2,
          1 shl 3,
          1 shl 4,
          1 shl 5,
          1 shl 6,
          1 shl 7
        );
  scan_not_found = $ffffffff;type
  t_pack_selection = array of byte;type
  t_prec_result = record
    value: cardinal;
    bitmap: cardinal;
  end; //8 bytes
  p_prec_result = ^t_prec_result;  t_prec_result_list = array[0..($100 * $100-1)] of t_prec_result;
  p_prec_result_list = ^t_prec_result_list;  t_prec_result_list256 = array[low(byte)..high(byte)] of t_prec_result;
  p_prec_result_list256 = ^t_prec_result_list256;  t_8nums = array[0..bits_per_byte-1] of cardinal;
  p_8nums = ^t_8nums;  t_prec_8 = array[byte] of cardinal;
  p_prec_8 = ^t_prec_8;
  t_bits = array[0..3] of byte;
  p_bits = ^t_bits;{$ifdef pack5_test}
  t_data_sample = class
  public
    no_result: boolean;
    init_selection: t_pack_selection;
    sum: cardinal;
    num_count: cardinal;
    min_selection_count: integer;
    nums: array of cardinal;
    perf_cnt_begin, perf_cnt_end: tlargeinteger;    constructor create(
            const a_no_result: boolean;
            const a_num_count: cardinal;
            const a_min_selection_count: integer
          );    procedure exec(const print_found_selection: boolean);

    function perf_cnt_period_to_ms(): double;      function verify(const selection: t_pack_selection; const print_nums: boolean): boolean;

  end;

  t_pack_tester = class
  public
    class procedure full_test;
  end;

{$endif pack5_test}  t_pack_searcher = class
  private (*** common private part ***)
    num_count: integer;
    sum: cardinal;
    no_result: boolean;
    nums_copy: array[0..39] of cardinal;
    require_order_map: boolean;    //for middle or large case
    prec_result_list: t_prec_result_list;
    prec_result_list_sz: cardinal;    //指出原来的顺序，取nums_order_map[i]，可以得到当前num_copy中第i个数的原来的序号
    //仅当require_order_map时有意义
    nums_order_map: array[0..39] of cardinal;    prec_8_arr: array[0..2] of t_prec_8;
    selection_sz: integer;    procedure sort_num_copies();    {$ifdef debug}
    procedure check_bitmap(
            const nums: array of cardinal;
            const bitmap: cardinal;
            const value: cardinal);
    procedure check_prec_result_bitmap(
            const nums: array of cardinal;
            p: p_prec_result;
            count: integer
          );
    {$endif}      procedure map_to_original_order(var selection: t_pack_selection);    procedure make_empty_pack_selection(var selection: t_pack_selection);    {
      pre-calculate 8 nums all posible sum of 8 nums combination
      nums_copy_index:  the first num index of nums_copy
      prec_8_arr_index: the result sum will put in prec_8_arr[prec_8_arr_index][]
    }
    procedure prepare_prec_8_arr(
            const nums_copy_index: integer;
            const prec_8_arr_index: integer
          );    procedure prepare_dummy_prec_8_arr(const prec_8_arr_index: integer);  private (*** private for small case ***)
    //for the case num_count < 16
    function internal_exec_smallcase(): t_pack_selection;  private (*** private for middle case ***)
    //pre-calcuate the results for all posible nums[0]..nums[8] combination
    //sort and store in prec_result_list
    procedure prepare_middle;
    //for the case num_count >= 16
    function internal_exec_ge16(): t_pack_selection;  private (*** private for large case ***)
    //pre-calcuate the results for all posible nums[0]..nums[15] combination
    //sort and store in prec_result_list
    procedure prepare_large;    //return value: low 16 bit selection
    //return $ffffffff for not found
    function scan2(const base_value: cardinal): cardinal;    //for the case num_count >= 24
    function internal_exec_ge24: t_pack_selection;  public
    constructor create(
            const a_no_result: boolean;
            const a_sum: cardinal;
            const a_nums: array of cardinal
          );    class procedure test_scan_proc;    function exec(): t_pack_selection;
  end;function is_empty_pack_selection(const selection: t_pack_selection): boolean;
function calc_selection_size(const num_count: integer): integer;
procedure print_selection(const selection: t_pack_selection);function test_selection(
        const selection: t_pack_selection;
        const index: integer): boolean;
implementationprocedure pause();
var
  s: string;
begin
  Writeln('press ENTER to continue...');
  Readln(s);
end;{$ifdef debug}
//print nums, and return the sum
function nums_sum(
      const nums: array of cardinal;
      const bitmap: cardinal;
      const print_nums: boolean): cardinal;
var
  i, max: integer;
begin
  result := 0;
  max := high(nums);
  if max > 31 then
    max := 31;
  for i := low(nums) to max do
    if bitmap and (1 shl i) <> 0 then
    begin
      inc(result, nums[i]);
      if print_nums then
        writeln('has_num(', i:2, '): ', nums[i]:12);
    end;
end;procedure touch(var x);
begin
  //do nothing
end;
{$endif}{$ifdef pack5_test}
procedure indent(spc_cnt: integer);
var
  s: string;
begin
  if spc_cnt > 0 then
  begin
    setlength(s, spc_cnt);
    fillchar(s[1], spc_cnt, ' ');
    write(s);
  end;
end;
{$endif}function is_empty_pack_selection(
        const selection: t_pack_selection
      ): boolean;
var
  i: integer;
begin
  result := false;
  for i := low(selection) to high(selection) do
    if selection[i] <> 0 then
      exit;  result := true;
end;
function calc_selection_size(const num_count: integer): integer;
begin
  result := (num_count + (bits_per_byte-1)) div bits_per_byte;
end;function test_bit(const p: pointer; const index: integer): boolean;
var
  byte_index, bit_index: integer;
begin
  byte_index := index div 8;
  bit_index := index mod 8;  result := (pbytearray(p)[byte_index] and bits[bit_index]) <> 0;
end;function test_selection(
        const selection: t_pack_selection;
        const index: integer): boolean;
begin
  {$ifdef debug}
  assert(length(selection) > 0);
  {$endif}  result := test_bit(@selection[0], index);
end;
{$ifdef trace}
var
  merge_move_count: integer;
  scan_hit_count: integer;
{$endif}

{$ifdef debug}
type
  t_cardinal_arr = array of cardinal;
function copy_nums(
        const nums: array of cardinal;
        const copy_start_index: integer;
        const count: integer): t_cardinal_arr;
begin
  assert(copy_start_index >= 0);
  assert((copy_start_index + count) <= length(nums));

  setlength(result, count);
  if count > 0 then
    move(nums[copy_start_index], result[0], count * sizeof(result[0]));
end;
{$endif}{ Copy from Borland.Classes.pas/implementation }
procedure QuickSort(SortList: PPointerList; L, R: Integer;
  SCompare: TListSortCompare);
var
  I, J: Integer;
  P, T: Pointer;
begin
  repeat
    I := L;
    J := R;
    P := SortList^[(L + R) shr 1];
    repeat
      while SCompare(SortList^[I], P) < 0 do
        Inc(I);
      while SCompare(SortList^[J], P) > 0 do
        Dec(J);
      if I <= J then
      begin
        T := SortList^[I];
        SortList^[I] := SortList^[J];
        SortList^[J] := T;
        Inc(I);
        Dec(J);
      end;
    until I > J;
    if L < J then
      QuickSort(SortList, L, J, SCompare);
    L := I;
  until I >= R;
end;procedure set_bit(p: pointer; bit_index: integer);
var
  byte_index: integer;
begin
  byte_index := bit_index div 8;
  bit_index := bit_index mod 8;
  pbytearray(p)[byte_index] := pbytearray(p)[byte_index] or bits[bit_index];
end;

function nums_copy_sort_compare_asc(a, b: pointer): integer;
begin
  if cardinal(a) = cardinal(b) then
    result := 0
  else if cardinal(a) > cardinal(b) then
    result := 1
  else result := -1;
end;function prec_result_sort_compare_asc(a, b: pointer): integer;
var
  x, y: p_prec_result;
begin
  x := a; y := b;
  if x^.value = y^.value then
    result := 0
  else if x^.value > y^.value then
    result := 1
  else result := -1;
end;
procedure calc_prec_result_list256(
        in_8nums_ptr: p_8nums;
        out_prec_result_temp: p_prec_result_list256);
{
  预计算8个数的各种可能的组合的和，并按和的大小升序排序
}
var
  i, j: integer;
  temp: cardinal;
  p: p_prec_result;
  ptr_array_ff: array[low(byte)..high(byte)] of p_prec_result;
  buff: t_prec_result_list256;
begin
  //预计算8个数的各种可能的组合的和
  for i := low(byte) to high(byte) do
  begin
    temp := 0;
    for j := 0 to bits_per_byte-1 do
      if (i and bits[j]) <> 0 then
        temp := temp + in_8nums_ptr[j];    p := @buff[i];
    p^.value := temp;
    p^.bitmap := i;    ptr_array_ff[i] := p;
  end;
  //排序
  quicksort(
          @ptr_array_ff,
          low(ptr_array_ff),
          high(ptr_array_ff),
          prec_result_sort_compare_asc
        );

  //now data in buff is sorted, copy to out_prec_result_temp
  for i := low(buff) to high(buff) do
    out_prec_result_temp[i] := ptr_array_ff[i]^;
end;procedure merge(x, y, result: p_prec_result; count: integer);
var
  x_cnt, y_cnt: integer;
begin
  x_cnt := count;
  y_cnt := count;  repeat
    if x^.value <= y^.value then
    begin
      //put x
      result^ := x^;
      inc(x);
      dec(x_cnt);
    end
    else
    begin
      //put y
      result^ := y^;
      inc(y);
      dec(y_cnt);
    end;    inc(result);
  until (x_cnt = 0) or (y_cnt = 0);  if (x_cnt > 0) then
    move(x^, result^, sizeof(x^) * x_cnt)
  else if (y_cnt > 0) then
    move(y^, result^, sizeof(y^) * y_cnt);  {$if defined(trace)}
  inc(merge_move_count, x_cnt + y_cnt);
  {$ifend}
end;{$ifdef debug}
procedure verify_order_le(first_prec_result: p_prec_result; count: integer);
var
  value: cardinal;
begin
  value := 0;
  while count > 0 do
  begin
    if value > first_prec_result^.value then
    begin
      asm int 3 end;
      touch(first_prec_result); touch(count);
      writeln('verify_order_le failed');
      pause();
    end;    value := first_prec_result^.value;
    inc(first_prec_result);
    dec(count);
  end;
end;
{$endif}
{$ifdef debug}
procedure t_pack_searcher.check_bitmap(
        const nums: array of cardinal;
        const bitmap, value: cardinal);
var
  s: cardinal;
begin
  s := nums_sum(nums, bitmap, false);
  if s <> value then
  begin
    nums_sum(nums, bitmap, true);
    asm
      int 3
    end;
    writeln('check_bitmap failed,  sum is ', s, ', expected is ', value);
    pause();
  end;
end;procedure t_pack_searcher.check_prec_result_bitmap(
        const nums: array of cardinal;
        p: p_prec_result;
        count: integer);
begin
  while count > 0 do
  begin
    check_bitmap(nums, p^.bitmap, p^.value);
    dec(count);
    inc(p);
  end;
end;
{$endif debug}constructor t_pack_searcher.create(
        const a_no_result: boolean;
        const a_sum: cardinal;
        const a_nums: array of cardinal);
var
  i: integer;
  sum_64: int64;
begin
  sum := a_sum;

  assert(not a_no_result or (sum > 0));  num_count := length(a_nums);  assert(
          (num_count > 0)
          and
          (num_count <= max_num_count)
        );  no_result := a_no_result;  sum_64 := 0;
  for i := low(a_nums) to high(a_nums) do
    inc(sum_64, a_nums[i]);  if sum_64 > (high(sum)-1) then      //high(cardinal) is reserved for no_result test
    raise exception.create('Invalid parameter, sum of nums is too large');  selection_sz := calc_selection_size(num_count);  move(a_nums[0], nums_copy[0], length(nums_copy) * sizeof(a_nums[0]));
end;
function t_pack_searcher.exec(): t_pack_selection;
begin  {$ifdef trace}
  scan_hit_count := 0;
  merge_move_count := 0;
  {$endif}

  if num_count >= 24 then
    result := internal_exec_ge24()
  else if num_count >= 16 then
    result := internal_exec_ge16()
  else
    result := internal_exec_smallcase();  {$ifdef trace}
  writeln('scan hit count: ', scan_hit_count);
  writeln('merge_move_count: ', merge_move_count);
  {$endif}
end;//for the case num_count >= 16
function t_pack_searcher.internal_exec_ge16: t_pack_selection;
var
  i, j: integer;
  bits: integer;
  temp, value_i: cardinal;
  scan_result, sum_0_7: cardinal;  procedure make_result(use_scan_result: boolean);
  begin
    setlength(result, selection_sz);
    if use_scan_result then
      result[0] := scan_result and $00ff
    else
      Result[0] := 0;    result[1] := j;
    if selection_sz > 2 then
      result[2] := i;    //map_to_original_order(result);
  end;begin
  prepare_middle();  prepare_prec_8_arr(       8, 0);

  if num_count > 16 then
    prepare_prec_8_arr(    16, 1)
  else prepare_dummy_prec_8_arr(1);  bits := 1 shl (num_count - 8) - 1;
  { reserved 8 bits is processed by scan() procedure }

  scan_result := scan_not_found;  {$ifndef perf_meas}
  sum_0_7 := 0;
  for i := 0 to 7 do
    sum_0_7 := sum_0_7 + nums_copy[i];
  {$else}
  //因为no_result时产生的sum为high(cardinal)-1
  //所以下面的if temp > sum_0_7 then
  //总是成立，这将影响性能测试的准确性
  //所以应该设为
  sum_0_7 := high(cardinal);
  {$endif}
  for i := t_bits(bits)[1] downto 0 do
  begin
    value_i := prec_8_arr[1][i];
    for j := t_bits(bits)[0] downto 0 do
    begin
      temp := sum - (value_i + prec_8_arr[0][j]);

      //即使 integer(temp) < 0
      //也有 temp <= sum_0_7
      if temp > sum_0_7 then
        Continue;

      if (temp = 0) then
      begin
        make_result(false);
        exit;
      end
      else if (integer(temp) < 0) then
        continue;
      scan_result := scan2(temp);
      if scan_result <> scan_not_found then
      begin
        make_result(true);
        exit;
      end;
    end;
  end;  make_empty_pack_selection(result);
end;

//for the case num_count >= 24
function t_pack_searcher.internal_exec_ge24: t_pack_selection;
var
  i, j, k: integer;
  bits: integer;
  temp, value_i, value_i_j: cardinal;
  scan_result, sum_0_15: cardinal;  procedure make_result(use_scan_result: boolean);
  begin
    setlength(result, selection_sz);
    if use_scan_result then
    begin
      result[0] := scan_result and $00ff;
      result[1] := (scan_result and $ff00) shr 8;
    end
    else
    begin
      result[0] := 0;
      result[1] := 0;
    end;

    result[2] := k;
    if selection_sz > 3 then
    begin
      result[3] := j;
      if selection_sz > 4 then
        result[4] := i;
    end;    map_to_original_order(result);
  end;begin
  //pre-calculate sums of 0..15
  prepare_large();  //prec-calculate sums of 16..39
  prepare_prec_8_arr(   16,   0);  if num_count > 24 then
    prepare_prec_8_arr( 24,   1)
  else prepare_dummy_prec_8_arr(1);  if num_count > 32 then
    prepare_prec_8_arr( 32,   2)
  else prepare_dummy_prec_8_arr(2);  bits := 1 shl (num_count - 16) - 1;
  { reserved 16 bits is processed by scan() procedure }  {$ifndef perf_meas}
  sum_0_15 := 0;
  for i := 0 to 15 do
    sum_0_15 := sum_0_15 + nums_copy[i];
  {$else}
  //因为no_result时产生的sum为high(cardinal)-1
  //所以下面的if temp > sum_0_15 then
  //总是成立，这将影响性能测试的准确性
  //所以应该设为
  sum_0_15 := high(cardinal);
  {$endif}
      scan_result := scan_not_found;  for i := t_bits(bits)[2] downto 0 do
  begin
    value_i := prec_8_arr[2][i];
    for j := t_bits(bits)[1] downto 0 do
    begin
      value_i_j := value_i + prec_8_arr[1][j];
      for k := t_bits(bits)[0] downto 0 do
      begin
        temp := sum - (value_i_j + prec_8_arr[0][k]);        //即使 integer(temp) < 0
        //也有 temp <= sum_0_15
        if temp > sum_0_15 then
          continue;        if (temp = 0) then
        begin
          make_result(false);
          exit;
        end
        else if integer(temp) < 0 then
          continue;        scan_result := scan2(temp);
        if scan_result <> scan_not_found then
        begin
          make_result(true);
          exit;
        end;
      end; // for k
    end; //for j
  end; //for i
  make_empty_pack_selection(result);
end;//for the case num_count < 16
function t_pack_searcher.internal_exec_smallcase: t_pack_selection;
var
  i, j: integer;
  bits: integer;
  value_i: cardinal;
begin
  prepare_prec_8_arr(0, 0);  if num_count > 8 then
    prepare_prec_8_arr(8, 1)
  else prepare_dummy_prec_8_arr(1);  bits := (1 shl num_count) - 1;  for i := t_bits(bits)[1] downto 0 do
  begin
    value_i := prec_8_arr[1][i];
    for j := t_bits(bits)[0] downto 0 do
    begin
      if (value_i + prec_8_arr[0][j]) = sum then
      begin
        setlength(result, selection_sz);
        result[0] := j;
        if selection_sz > 1 then
          result[1] := i;
        exit;
      end;
    end;
  end;  make_empty_pack_selection(result);
end;procedure t_pack_searcher.make_empty_pack_selection(
  var selection: t_pack_selection);
begin
  if length(selection) <> selection_sz then
    setlength(selection, selection_sz);  fillchar(
          selection[0],
          sizeof(selection[0]) * selection_sz,
          0
        );
end;procedure t_pack_searcher.map_to_original_order(
  var selection: t_pack_selection);
var
  b: byte;
  i, j, bit_index_base, org_index: integer;
  temp: t_pack_selection;
begin
  if not require_order_map then
    exit;  setlength(temp, length(selection));  for i := low(selection) to high(selection) do
  begin
    b := selection[i];
    bit_index_base := i * 8;
    for j := 0 to bits_per_byte - 1 do
    begin
      if (b and bits[j]) <> 0 then
      begin
        org_index := nums_order_map[bit_index_base + j];
        set_bit(@temp[0], org_index);
      end;
    end;
  end;

  move(
      temp[0],
      selection[0],
      sizeof(temp[0]) * length(temp)
    );
end;procedure t_pack_searcher.prepare_dummy_prec_8_arr(
  const prec_8_arr_index: integer);
begin
  fillchar(
          prec_8_arr[prec_8_arr_index],
          sizeof(prec_8_arr[prec_8_arr_index]),
          0
        );
end;

procedure t_pack_searcher.prepare_large;
var
  prec_results_temp_lo: t_prec_result_list256;              //size=$100 * 8
  prec_results_temp_hi: t_prec_result_list256;              //size=$100 * 8;
  //required stack size is $1 000  i, j: cardinal;
  iter_lo, iter_hi: integer;
  p_lo, p_hi, p_dest: p_prec_result;  prec_result_list_temp,
  prec_result_list_src,
  prec_result_list_dst,
  swap: p_prec_result_list;  dest_is_temp: boolean;
begin
  //对num_copy进行排序
  sort_num_copies();
  prec_result_list_sz := length(prec_result_list);   //现在, nums[0] <= nums[1] <= nums[2] ... <= nums[15]
  //所以我们接下来的calc_prec_results_temp()调用的性能应该是很好的  //对 nums[0.. 7] 产生所有的组合的 sum 到 prec_results_temp_lo,
  //对 nums[8..15] 产生所有的组合的 sum 到 prec_results_temp_hi,
  //并对两个结果数组进行排序
  //开销 = $100 * log($100) * 2
  calc_prec_result_list256(@nums_copy[0], @prec_results_temp_lo); //quick sort
  {$ifdef debug}
  check_prec_result_bitmap(
          nums_copy,
          @prec_results_temp_lo[0],
          length(prec_results_temp_lo)
        );
  {$endif}
  calc_prec_result_list256(@nums_copy[8], @prec_results_temp_hi); //quick sort
  {$ifdef debug}
  check_prec_result_bitmap(
          copy_nums(nums_copy, 8, 8),
          @prec_results_temp_hi[0],
          length(prec_results_temp_hi)
        );
  {$endif}  //至此，有
  //with prec_results_temp_lo:
  //   [0].value <= [1].value <= [2].value ... <= [255].value
  //对prec_results_temp_hi亦有:
  //   [0].value <= [1].value <= [2].value ... <= [255].value  getmem(prec_result_list_temp, sizeof(prec_result_list_temp^));
  try
    //接下来，我们再从这两个256个可能的值中组合产生65536个16位值
    p_dest := @prec_result_list[0]; //结果放在这个大数组中                                      (*****************************************
                                       * TODO:
                                       * use xmm to accelerate the addition
                                       * operation
                                       *****************************************)
    for iter_hi := low(prec_results_temp_hi) to high(prec_results_temp_hi) do
    begin
      p_hi := @prec_results_temp_hi[iter_hi];      for iter_lo := low(prec_results_temp_lo) to high(prec_results_temp_lo) do
      begin
        p_lo := @prec_results_temp_lo[iter_lo];        p_dest^.value := p_hi^.value + p_lo^.value;
        p_dest^.bitmap := (p_hi^.bitmap shl 8) + p_lo^.bitmap;        {$ifdef debug}
        check_bitmap(nums_copy, p_dest^.bitmap, p_dest^.value);
        {$endif}        inc(p_dest);
      end;
    end;    //因为前面的prec_results_temp_lo和prec_results_temp_hi都是已经有序的
    //所以
    //  for i := 0 to 255-1，恒有
    //    prec_result_list[i]  <=  prec_result_list[i+1]
    //
    //  for i := 256 to 511-1, 有
    //    prec_result_list[i]  <=  prec_result_list[i+1]
    //
    //依次类推，整个prec_result_list以256个记录为组分组，都有这些规律
    //
    //那么，可以在这个基础上进行归并排序，每相邻的两个分组作一次归并，得到一个更大
    //的512数组，再归并，直到最后，两个32768个记录的数组归并成一个65536个记录
    //的数组，排序完毕
    //也可以对折归并，对折时，左边的数据比右边数据小的情况比较多，merge过程的比
    //较次数就会相对减少，所以对折归并比较有利    j := $200; //最开始，每次合并的覆盖面为两个$100
    prec_result_list_src := @prec_result_list;
    prec_result_list_dst := prec_result_list_temp; //第1次: merge的结果在prec_result_list_temp    dest_is_temp := true;    {
      对折归并
      以下j循环的总共开销为
        8 * $10 000 =  $80 000 次 (比较+存储)
        +
        8 * 128 = 1024 次 (函数调用)
    }
    while j <= $10000 do //合并的覆盖面小于等于$10000, 总共循环8次
    begin

      {
        以下i循环开销为
        ($10000 div 2 div (j div 2)) * (j/2 + j/2) = $10000
      }
      i := 0;
      while i < $10000 div 2 do
      begin
        merge(
                @prec_result_list_src^[i],
                @prec_result_list_src^[i+cardinal($10000 div 2)],
                @prec_result_list_dst^[i*2],
                j div 2
              );
        inc(i, j div 2);
      end;//      //毗邻的数组两两归并
//      i := 0;
//      while i < $10000 do
//      begin
//        merge(
//                @prec_result_list_src^[i],
//                @prec_result_list_src^[i+j div 2],
//                @prec_result_list_dst^[i],
//                j div 2
//              );
//        inc(i, j);
//      end;      j := j * 2; //覆盖面增大1倍      //交换源/目标数组
      swap := prec_result_list_src;
      prec_result_list_src := prec_result_list_dst;
      prec_result_list_dst := swap;

      if j <= $10000 then
        dest_is_temp := not dest_is_temp;
    end;    if dest_is_temp then
      move(
              prec_result_list_temp^,
              prec_result_list,
              sizeof(prec_result_list)
            );
    //第1次: merge的结果在: in prec_result_list_temp
    //2nd: prec_result_list
    //3: prec_result_list_temp
    //4: prec_result_list
    //5: prec_result_list_temp
    //6: prec_result_list
    //7: prec_result_list_temp
    //8: prec_result_list <<最后一次    //所以，至此需要的结果已经准备好在prec_result_list了    //释放prec_result_list_temp
  finally
    freemem(prec_result_list_temp);
  end;  {$ifdef debug}
  //验证结果数组的次序性
  verify_order_le(@prec_result_list[0], length(prec_result_list));
  //验证结果数组的位组合是否与他的和相对应
  check_prec_result_bitmap(nums_copy, @prec_result_list[0], length(prec_result_list));
  {$endif}
end;procedure t_pack_searcher.prepare_middle;
begin
  prec_result_list_sz := $100;  calc_prec_result_list256(@nums_copy, @prec_result_list);
end;{
  pre-calculate 8 nums all posible sum of 8 nums combination
  nums_copy_index:  the first num index of nums_copy
  prec_8_arr_index: the result sum will put in prec_8_arr[prec_8_arr_index][]
}
procedure t_pack_searcher.prepare_prec_8_arr(
        const nums_copy_index: integer;
        const prec_8_arr_index: integer
      );
var
  i, j: integer;
  temp: cardinal;
begin
  for i := low(byte) to high(byte) do
  begin
    temp := 0;
    for j := 0 to bits_per_byte-1 do
      if (i and bits[j]) <> 0 then
        inc(temp, nums_copy[nums_copy_index + j]);

    prec_8_arr[prec_8_arr_index][i] := temp;
  end;
end;

//二分查找
function t_pack_searcher.scan2(const base_value: cardinal): cardinal;
var
  i, min_index, max_index: integer;
  temp: cardinal;
begin
  min_index := 0;
  max_index := prec_result_list_sz - 1;  i := max_index shr 1;

  while true do
  begin
    temp := prec_result_list[i].value - base_value;
    {$ifdef trace}
    inc(scan_hit_count, 1);
    {$endif}    if temp = 0 then
    begin
      result := prec_result_list[i].bitmap;
      exit;
    end
    else if integer(temp) < 0 then
    begin
      if i = max_index then
        break;      min_index := i + 1;
      i := ((i + 1) + max_index) shr 1;
    end
    else // if temp > 0 then
    begin
      if i = min_index then
        break;      max_index := i - 1;
      i := (min_index + (i - 1)) shr 1;
    end;
  end;  result := scan_not_found; //not found
end;{
  对nums_copy进行排序，并且记录他们原来的次序
}
procedure t_pack_searcher.sort_num_copies;
var
  i: integer;
  sort_temp: array[0..39] of t_prec_result;
  pp: array[0..39] of p_prec_result;
begin
  for i := 0 to num_count-1 do
  begin
    sort_temp[i].value := nums_copy[i];
    sort_temp[i].bitmap := i;
    pp[i] := @sort_temp[i];
  end;  quicksort(
          pointer(@pp),
          0,
          num_count-1,
          prec_result_sort_compare_asc
        );  for i := 0 to num_count-1 do
  begin
    nums_copy[i] := pp[i]^.value;
    nums_order_map[i] := pp[i]^.bitmap;
  end;

  require_order_map := true;
end;class procedure t_pack_searcher.test_scan_proc;
var
  i: integer;
  obj: t_pack_searcher;
begin
  obj := t_pack_searcher(t_pack_searcher.newinstance());
  try
    for i := low(obj.prec_result_list) to high(obj.prec_result_list) do
    begin
      obj.prec_result_list[i].value := i;
      obj.prec_result_list[i].bitmap := i;
    end;    obj.prec_result_list_sz := length(obj.prec_result_list);    for i := low(obj.prec_result_list) to high(obj.prec_result_list) do
    begin
      obj.sum := i;
      if obj.scan2(i) = scan_not_found then
        raise exception.create('error');
    end;
  finally
    obj.destroy();
  end;end;
procedure print_byte_bitmap(const b: byte);
var
  i: integer;
begin
  for i := 0 to bits_per_byte-1 do
  begin
    if (b and bits[i]) <> 0 then
      write('1')
    else write('0');
  end;  write(' ');
end; procedure print_selection(const selection: t_pack_selection);
var
  i: integer;
begin
  for i := low(selection) to high(selection) do
    print_byte_bitmap(selection[i]);
end;

{$ifdef pack5_test}type
  t_bitmap_64 = record
    lo, hi: cardinal;
  end;
{ t_data_sample }
constructor t_data_sample.create(
        const a_no_result: boolean;
        const a_num_count: cardinal;
        const a_min_selection_count: integer);var
  i, rr_lo, rr_hi, bit_count, selection_sz: integer;
  bitmask_lo, bitmask_hi: cardinal;
  bitmap: t_bitmap_64;
  sum64, max_sum: int64;
begin
  assert((a_num_count > 0) and (a_num_count <= 40));  no_result := a_no_result;
  num_count := a_num_count;
  min_selection_count := a_min_selection_count;  randomize();  setlength(nums, a_num_count);  //the allowed sum is high(cardinal)-1
  rr_hi := maxint div integer(a_num_count);
  rr_lo := rr_hi div 4;  for i := low(nums) to high(nums) do
    nums[i] := randomrange(rr_lo, rr_hi);  bitmap.lo := 0;
  bitmap.hi := 0;  if not no_result then
  begin
    max_sum := high(cardinal);
    max_sum := max_sum - 1;    if num_count >= 32 then
      bitmask_lo := (not cardinal(0))
    else
      bitmask_lo := (not cardinal(0)) shr (32 - num_count);    if num_count > 32 then
      bitmask_hi := (not cardinal(0)) shr (8 - (num_count-32)) and $ff
    else bitmask_hi := 0;    repeat
      if min_selection_count < 31 then
        bitmap.lo := bitmap.lo or randomrange(1, maxint) and bitmask_lo
      else bitmap.lo := bitmask_lo;      if min_selection_count > 32 then
      begin
        if min_selection_count < 40 then
          bitmap.hi := bitmap.hi or randomrange(1, maxint) and bitmask_hi
        else bitmap.hi := bitmask_hi;
      end;      sum64 := 0;
      bit_count := 0;
      for i := low(nums) to high(nums) do
      begin
        if test_bit(@bitmap, i) then
        begin
          inc(sum64, nums[i]);
          inc(bit_count);
        end;
      end;
    until ((sum64 < max_sum) and (bit_count >= min_selection_count));    sum := sum64;
  end
  else
    //no result case
    sum := high(cardinal);  selection_sz := calc_selection_size(num_count);
  setlength(init_selection, selection_sz);

  for i := low(nums) to high(nums) do
    if test_bit(@bitmap, i) then
      set_bit(@init_selection[0], i);
end;
procedure t_data_sample.exec(const print_found_selection: boolean);
var
  searcher: t_pack_searcher;
  selection: t_pack_selection;
begin
  QueryPerformanceCounter(perf_cnt_begin);
  searcher := t_pack_searcher.create(no_result, sum, nums);
  try
    selection := searcher.exec();
  finally
    searcher.destroy();
  end;
  QueryPerformanceCounter(perf_cnt_end);  if verify(selection, false) and print_found_selection then
  begin
    write('found selection: '); print_selection(selection); writeln('');
  end;
end;function t_data_sample.perf_cnt_period_to_ms: double;
var
  freq: tlargeinteger;
begin
  QueryPerformanceFrequency(freq);
  result := (perf_cnt_end - perf_cnt_begin)  * 1000;
  result := result / freq;
end;function t_data_sample.verify(
        const selection: t_pack_selection;
        const print_nums: boolean): boolean;
var
  i, selected_count: integer;
  sum_verify: cardinal;  procedure print_error();
  begin
    writeln('******************************');
    writeln('ERROR: ');
    writeln('******************************');
  end;begin
  if is_empty_pack_selection(selection) then
  begin
    if no_result then
    begin
      result := true;
      exit;
    end
    else
    begin
      writeln('original sum is :', sum);
      write('original selection is: '); print_selection(init_selection); writeln('');
      print_error();
      pause();
      abort();
    end;
  end;  sum_verify := 0;
  selected_count := 0;
  for i := low(nums) to high(nums) do
  begin
    if test_selection(selection, i) then
    begin
      inc(sum_verify, nums[i]);
      inc(selected_count);      if print_nums then
        writeln('select ', i:2, 'th num: ', nums[i]:12);
    end;
  end;
  if print_nums then
    writeln('selected count: ', selected_count);  result := sum_verify = sum;
  if not result then
  begin
    print_error();
    writeln('the selected sum is :', sum_verify);
    writeln('original sum is :', sum);
    write('original selection is: '); print_selection(init_selection); writeln('');
    write('found selection is:    '); print_selection(selection); writeln('');
    pause();
    abort();
  end;
end;

type
  t_perf_count_collection = record
    expected_times, tested_times: integer;
    used_times: array of double;
  end;
  p_perf_count_collection = ^t_perf_count_collection;{ t_pack_tester }
class procedure t_pack_tester.full_test;
  procedure _exec_(
          const a_no_result: boolean;
          const a_num_count: cardinal;
          const a_min_selection_count: integer;
          perf_cnt_collect: p_perf_count_collection;
          const a_indent: integer
        );
  var
    sample: t_data_sample;
  begin
    sample := t_data_sample.create(
            a_no_result,
            a_num_count,
            a_min_selection_count
          );
    try
      if a_indent > 0 then
        indent(a_indent);

      sample.exec(perf_cnt_collect <> nil);      if perf_cnt_collect <> nil then
      begin
        if perf_cnt_collect^.tested_times < perf_cnt_collect^.expected_times then
        begin
          perf_cnt_collect^.used_times[perf_cnt_collect^.tested_times] :=
                  sample.perf_cnt_period_to_ms();          inc(perf_cnt_collect^.tested_times);
        end;
      end;
    finally
      sample.destroy();
    end;
  end;  procedure do_test(
          const no_result: boolean;
          const num_count: cardinal;
          perf_cnt_collect: p_perf_count_collection;
          const indent: integer
        );
  begin
    if no_result then
      _exec_(true, num_count, 0, perf_cnt_collect, indent)
    else
    begin
      if perf_cnt_collect = nil then
      begin
        _exec_(false, num_count, 1, perf_cnt_collect, indent);
        if num_count > 1 then
          _exec_(false, num_count, num_count, nil, indent);
        if num_count > 2 then
          _exec_(false, num_count, num_count-1, nil, indent);
      end;      if num_count > 5 then
        _exec_(false, num_count, num_count div 2, perf_cnt_collect, indent);      if num_count > 7 then
      begin
        _exec_(false, num_count, num_count div 2 - 1, perf_cnt_collect, indent);
        _exec_(false, num_count, num_count div 2 + 1, perf_cnt_collect, indent);
      end;      if num_count > 9 then
      begin
        _exec_(false, num_count, num_count div 2 - 2, perf_cnt_collect, indent);
        _exec_(false, num_count, num_count div 2 + 2, perf_cnt_collect, indent);
      end;
    end;
  end;  procedure init_pcc(const expected_times: integer; var pcc: t_perf_count_collection);
  begin
    pcc.expected_times := expected_times;
    pcc.tested_times := 0;
    if length(pcc.used_times) <> expected_times then
      setlength(pcc.used_times, expected_times);
  end;  procedure avg_of_pcc(
          const pcc: t_perf_count_collection;
          var max, min, avg: double);
  var
    i: integer;
    t: double;
  begin
    max := 0;
    min := MaxInt;
    avg := 0;
    for i := 0 to pcc.tested_times - 1 do
    begin
      t := pcc.used_times[i];
      avg := avg + t;
      if t > max then
        max := t;
      if t < min then
        min := t;
    end;    avg := avg / pcc.tested_times;
  end;  function bool2str(b: boolean): string;
  const b_str: array[boolean] of string = ('false', 'true');
  begin
    result := b_str[b];
  end;  procedure print_task_title(const title: string);
  begin
    writeln('######################');
    writeln('TASK: ', title);
    writeln('######################');
  end;  procedure test_correctness();
  var
    i: integer;  begin
    print_task_title('test correctness');
    for i := 1 to max_num_count do
    begin
      indent(2); write('num count: ', i:2, ', no result = ', bool2str(true));
      do_test(true, i, nil, 0);
      do_test(true, i, nil, 0);
      do_test(true, i, nil, 0);
      do_test(true, i, nil, 0);
      writeln('  OK');      indent(2); write('num count: ', i:2, ', no result = ', bool2str(false));
      do_test(false, i, nil, 0);
      do_test(false, i, nil, 0);
      do_test(false, i, nil, 0);
      do_test(false, i, nil, 0);
      writeln(' OK');
    end;
    writeln('');
  end;  procedure test_performance();    procedure _p_(
            const no_result: boolean;
            const num_count: integer);
    const
      times = 4;
    var
      i: integer;
      pcc: t_perf_count_collection;
      max, min, avg: double;
    begin
      init_pcc(times * 32, pcc);      indent(4); writeln('no result = ', bool2str(no_result));      for i := 1 to times do
      begin
        do_test(no_result, num_count, @pcc, 6);
        if no_result then
          do_test(no_result, num_count, @pcc, 6);
      end;      avg_of_pcc(pcc, max, min, avg);      indent(6); writeln('tested ', pcc.tested_times, ' pass(es)');
      indent(6); writeln('max used time(ms):', formatfloat('0.0000', max));
      indent(6); writeln('min used time(ms):', formatfloat('0.0000', min));
      indent(6); writeln('avg used time(ms):', formatfloat('0.0000', avg));
      writeln('');
    end;    procedure p(const num_count: integer);
    begin
      indent(2); writeln('num count = ', num_count:2);
      indent(2); writeln('---------------------');
      _p_(true, num_count);
      _p_(false, num_count);
      writeln('');
    end;  var
    i: integer;
  begin
    print_task_title('test performance');
    i := 8;
    while i <= max_num_count do
    begin
      p(i-1);
      p(i);
      inc(i, 8);
    end;
  end;

begin
  writeln('+=======================================+');
  writeln(' pack FULL TEST');
  writeln('+=======================================+');  t_pack_searcher.test_scan_proc();  test_correctness();
  {$ifdef perf_meas}
  test_performance();
  {$endif}  writeln('*****************************************');
  writeln('');
  writeln('+=======================================+');
  writeln(' FINISH');
  writeln('+=======================================+');
  writeln('');
  writeln('');
  pause();
end;
{$endif}end.

呃，注释少了点是吧，t_pack_searcher是用类，用法很简单，先create对象，然后exec()就行了，返回一个字节数组，表示对参数nums的选择，如果返回的字节数组全部为0，则表示没有找到解t_pack_tester包含了一个类方法用来测试t_pack_searcher的正确性和性能前面我说的问题我现在想明白了，估计就是预计算和排序占用的时间接近0.015秒，而搜索过程则不费什么时间，所以24个数和32个数的耗时非常接近

基本上看明白了些，
时间复杂度基本上从 O(n^2) 降到了 O(log2 N) 了

调试易

拆数程序的后续

解决方案 »