应 alphax 的要求,提出我的最终代码原贴 http://community.csdn.net/Expert/topic/4515/4515392.xml?temp=.245907我的代码今天早上发现了些错误,所以重新改写了iNumberCount 是数的个数
ArrNumbers 是存放数的数组function ChaishuPlus: integer;
type
tByteArray = record
Bytes: array[0..3] of byte;
end;
var
i, j, k, t, s: DWORD;
aTemp: array [0..3] of array [0..$ff] of integer;
begin
//预计算
ZeroMemory(@aTemp, SizeOf(aTemp));
for i:= 0 to 3 do begin // 每个DWORD 4个字节
for j:= 0 to $ff do begin // 每个字节 256种可能
for k:= 0 to 7 do begin // 每个字节 8 位
if (1 shl k) and j <> 0 then begin
aTemp[i][j]:= aTemp[i][j] + ArrNumbers[i*8+k+1];
end;
end;
end;
end;
Result:= 0;
// 防止数组为空
if iNumberCount=0 then
s:= 0
else
s:= (1 shl iNumberCount-1) -1;
// 开始拆解计算
for i:= 1 to s do begin
t:= aTemp[0][tByteArray(i).Bytes[0]] + aTemp[1][tByteArray(i).Bytes[1]] +
aTemp[2][tByteArray(i).Bytes[2]] + aTemp[3][tByteArray(i).Bytes[3]] - sum; // (1)
if t=0 then begin // 这个写法是为了优化(2)
Result:= i;
break;
end;
end;
end;Result 的各个bit位,如果为1 就代表 相应的ArrNumbers 是结果。
这个就是我的最终代码。基本上抄袭了alphax 原来的算法,
只是稍微扩展了一下成了32个数的(原来写的64个数的有点问题),他使用空间换时间的思想,空间其实也不大,才4*256*4=4K字节,
但是alphax说 把预计算从8bit变成16bit能快1/5,参看汇编代码之后对此表示一点怀疑。下面提一下 (1)和(2)部分
ArrNumbers 是存放数的数组function ChaishuPlus: integer;
type
tByteArray = record
Bytes: array[0..3] of byte;
end;
var
i, j, k, t, s: DWORD;
aTemp: array [0..3] of array [0..$ff] of integer;
begin
//预计算
ZeroMemory(@aTemp, SizeOf(aTemp));
for i:= 0 to 3 do begin // 每个DWORD 4个字节
for j:= 0 to $ff do begin // 每个字节 256种可能
for k:= 0 to 7 do begin // 每个字节 8 位
if (1 shl k) and j <> 0 then begin
aTemp[i][j]:= aTemp[i][j] + ArrNumbers[i*8+k+1];
end;
end;
end;
end;
Result:= 0;
// 防止数组为空
if iNumberCount=0 then
s:= 0
else
s:= (1 shl iNumberCount-1) -1;
// 开始拆解计算
for i:= 1 to s do begin
t:= aTemp[0][tByteArray(i).Bytes[0]] + aTemp[1][tByteArray(i).Bytes[1]] +
aTemp[2][tByteArray(i).Bytes[2]] + aTemp[3][tByteArray(i).Bytes[3]] - sum; // (1)
if t=0 then begin // 这个写法是为了优化(2)
Result:= i;
break;
end;
end;
end;Result 的各个bit位,如果为1 就代表 相应的ArrNumbers 是结果。
这个就是我的最终代码。基本上抄袭了alphax 原来的算法,
只是稍微扩展了一下成了32个数的(原来写的64个数的有点问题),他使用空间换时间的思想,空间其实也不大,才4*256*4=4K字节,
但是alphax说 把预计算从8bit变成16bit能快1/5,参看汇编代码之后对此表示一点怀疑。下面提一下 (1)和(2)部分
for I := 1 to $00FFFFFF do
begin
temp0 := precarr[0][tbits(I).bytearray[0]];
temp1 := precarr[1][tbits(I).bytearray[1]];
temp2 := precarr[2][tbits(I).bytearray[2]]; temp0 := temp0 + temp1 + temp2; // (3) if abs(temp0 - sum) < 100 then // (4)
begin
Result := I;
Exit;
end;
end;其中(4)的地方 if abs(temp0 - sum) < 100 then 完全可以改成 temp0 - sum = 0 then
因为是正数运算了,那个误差没必要考虑了,计算前浮点属化整数的过程我已经处理了精度。
按照这个修改 我的代码应该 t:= aTemp[0][tByteArray(i).Bytes[0]] + aTemp[1][tByteArray(i).Bytes[1]] +
aTemp[2][tByteArray(i).Bytes[2]] + aTemp[3][tByteArray(i).Bytes[3]];
if t=sum then begin因为这个是循环的主体,所以调试的时候查看了汇编了代码,发现了一个问题(编译优化已经打开)看循环体部分的汇编代码00459569 |. 76 5B JBE SHORT TakeApar.004595C6
0045956B |. C745 FC 010000>MOV DWORD PTR SS:[EBP-4],1 t:= aTemp[0][tByteArray(i).Bytes[0]] + aTemp[1][tByteArray(i).Bytes[1]] +
aTemp[2][tByteArray(i).Bytes[2]] + aTemp[3][tByteArray(i).Bytes[3]];
00459572 |> 0FB645 FC /MOVZX EAX,BYTE PTR SS:[EBP-4]
00459576 |. 8BB485 F8EFFFF>|MOV ESI,DWORD PTR SS:[EBP+EAX*4-1008]
0045957D |. 0FB645 FD |MOVZX EAX,BYTE PTR SS:[EBP-3]
00459581 |. 03B485 F8F3FFF>|ADD ESI,DWORD PTR SS:[EBP+EAX*4-C08]
00459588 |. 0FB645 FE |MOVZX EAX,BYTE PTR SS:[EBP-2]
0045958C |. 03B485 F8F7FFF>|ADD ESI,DWORD PTR SS:[EBP+EAX*4-808]
00459593 |. 0FB645 FF |MOVZX EAX,BYTE PTR SS:[EBP-1]
00459597 |. 03B485 F8FBFFF>|ADD ESI,DWORD PTR SS:[EBP+EAX*4-408] if sum=t then begin
0045959E |. 8B45 F8 |MOV EAX,DWORD PTR SS:[EBP-8]
004595A1 |. 8B80 5C040000 |MOV EAX,DWORD PTR DS:[EAX+45C]
004595A7 |. 99 |CDQ
004595A8 |. 52 |PUSH EDX
004595A9 |. 50 |PUSH EAX
004595AA |. 8BC6 |MOV EAX,ESI
004595AC |. 33D2 |XOR EDX,EDX
004595AE |. 3B5424 04 |CMP EDX,DWORD PTR SS:[ESP+4]
004595B2 |. 75 03 |JNZ SHORT TakeApar.004595B7
004595B4 |. 3B0424 |CMP EAX,DWORD PTR SS:[ESP]
004595B7 |> 5A |POP EDX
004595B8 |. 58 |POP EAX
004595B9 |. 75 05 |JNZ SHORT TakeApar.004595C0 Result:= i;
004595BB |. 8B5D FC |MOV EBX,DWORD PTR SS:[EBP-4] break;
004595BE |. EB 06 |JMP SHORT TakeApar.004595C6会发现这个 if sum=t then begin 居然翻出这么一大堆东西,天知道编译器(Delphi2005)怎么优化的于是乎换了个思路改成
t:= aTemp[0][tByteArray(i).Bytes[0]] + aTemp[1][tByteArray(i).Bytes[1]] +
aTemp[2][tByteArray(i).Bytes[2]] + aTemp[3][tByteArray(i).Bytes[3]] - sum;
if t=0 then begin
Result:= i;
break;
end;这次出来的代码是我想要的了
t:= aTemp[0][tByteArray(i).Bytes[0]] + aTemp[1][tByteArray(i).Bytes[1]] +
aTemp[2][tByteArray(i).Bytes[2]] + aTemp[3][tByteArray(i).Bytes[3]] - sum;
00459570 |> 0FB64D FC /MOVZX ECX,BYTE PTR SS:[EBP-4]
00459574 |. 8B8C8D F8EFFFF>|MOV ECX,DWORD PTR SS:[EBP+ECX*4-1008]
0045957B |. 0FB65D FD |MOVZX EBX,BYTE PTR SS:[EBP-3]
0045957F |. 038C9D F8F3FFF>|ADD ECX,DWORD PTR SS:[EBP+EBX*4-C08]
00459586 |. 0FB65D FE |MOVZX EBX,BYTE PTR SS:[EBP-2]
0045958A |. 038C9D F8F7FFF>|ADD ECX,DWORD PTR SS:[EBP+EBX*4-808]
00459591 |. 0FB65D FF |MOVZX EBX,BYTE PTR SS:[EBP-1]
00459595 |. 038C9D F8FBFFF>|ADD ECX,DWORD PTR SS:[EBP+EBX*4-408]
0045959C |. 8B5D F8 |MOV EBX,DWORD PTR SS:[EBP-8]
0045959F |. 2B8B 5C040000 |SUB ECX,DWORD PTR DS:[EBX+45C] if t=0 then begin // 这个才看着正常本来就是一个比较嘛
004595A5 |. 85C9 |TEST ECX,ECX
004595A7 |. 75 05 |JNZ SHORT TakeApar.004595AE Result:= i;
004595A9 |. 8B55 FC |MOV EDX,DWORD PTR SS:[EBP-4] break;
004595AC |. EB 06 |JMP SHORT TakeApar.004595B4改了之后,32个数,无解的情况下 Pentium M 1.6 从96秒降到了 67秒。不知道是不是还有优化的办法。
alphaX,在你的机器看一看是不是也这样?另外,Delphi自带的CPU View怎么段拷贝?我这是用的OllyDbg反汇编的。
begin
temp0 := precarr[0][tbits(I).bytearray[0]];
temp1 := precarr[1][tbits(I).bytearray[1]];...同时,在循环体内的可用寄存器又增加了,编译器优化的自由度就提高了,
不过,如果预计算变成16bit,bytearray就增大了,一级高速缓存的命中率会低于8bit预计算的情况
>>Delphi自带的CPU View怎么段拷贝
用鼠标指向需要复制的第一行,按下鼠标托拽到结束行,松开,右击得到弹出菜单,点Copy
用鼠标指向需要复制的第一行,按下鼠标托拽到结束行,松开,右击得到弹出菜单,点Copyd2006是这样,d2005不知道,如果不行可以装一个expert(vcl原码刨析的作者做了一个,书在家里,忘了作者名了)
拆数代码我又作了优化,代码见下,主要是作标量替换和loop unrollingP4 1.8G, 32个数无解的情况下, 6秒以内返回(D2006编译)
如果--"计算前浮点属化整数的过程我已经处理了精度",应该还可以再快代码全部加了//,以保留格式//program Project14;
//
//{$APPTYPE CONSOLE}
//
//{.$define num23}
//{.$define pentium3}
//
//uses
// SysUtils, Windows, Math;
//
//
// //return the selection, 0 for not found
// function p4_2(
// const sum, mistake: Integer;
// const nums: array of integer;
// const noresult: boolean): Integer;
// label 1;
// type
// tprec8 = array[byte] of integer;
// pprec8 = ^tprec8;
// tbits = record
// bytearray: array[0..3] of byte;
// end;
// var
// count, bitmask: integer;
// i, j, k, l, m: Integer;
// si, sj, sk, temp, sum_min, sum_max: Integer;
// precarr: array[0..3] of tprec8;
// {$IFDEF pentium3}
// prec8_ptr: pprec8;
// {$ENDIF}
// prec8_1_ptr: pprec8;
//
// pint: pinteger;
//
// procedure precalc();
// var
// bits: array[0..7] of byte;
// b: byte;
// i, j, temp: Integer;
// pIntArr: PIntegerArray;
// begin
// for j := 0 to 7 do
// bits[j] := 1 shl j;
//
// pIntArr := @nums[0];
// for i := 0 to 3 do
// begin
// for b := low(b) to high(b) do
// begin
// temp := 0;
// for j := 0 to 7 do
// begin
// if (b and bits[j]) > 0 then
// inc(temp, pIntArr[j]);
// end;
//
// precarr[i][b] := temp;
// end;
//
// inc(PInteger(pIntArr), 8);
// end;
// end;
//
// procedure checknums();
// var
// i: integer;
// sum: Int64;
// begin
// sum := 0;
// for i := low(nums) to high(nums) do
// sum := sum + nums[i];
//
// if sum > MaxInt then
// raise Exception.Create('invalid paramter');
// end;
//
// begin
// count := Length(nums);
// Assert((count > 0) and (count <= 32));
// Assert(sum > 0);
//
// checknums();
// precalc();
//
//
//
// if count = 32 then
// bitmask := not 0
// else if count = 31 then
// bitmask := not $80000000
// else bitmask := 1 shl count - 1;
//
// sum_min := sum - mistake;
// sum_max := sum + mistake;
//
// prec8_1_ptr := @precarr[1];
//
// for i := tbits(bitmask).bytearray[3] downto 0 do
// begin
// si := precarr[3][i];
// for j := tbits(bitmask).bytearray[2] downto 0 do
// begin
// sj := si + precarr[2][j];
// for k := tbits(bitmask).bytearray[1] downto 0 do
// begin
// sk := sj + precarr[1][k];
// {$IFDEF pentium3}
// prec8_ptr := @precarr[0];
// for L := $FF downto 0 do
// begin
// temp := sk + prec8_ptr[L];
// if (temp > sum_min) and (temp < sum_max) then
// goto 1;
// end;
// asm
// prefetcht0 [offset prec8_1_ptr[k-1]]
// end;
// {$ELSE}
// pint := @precarr[0][$ff];
// for L := $1f downto 0 do
// begin
// temp := sk + pint^;
// if (temp > sum_min) and (temp < sum_max) then
// goto 1;
// dec(pint);
//
// temp := sk + pint^;
// if (temp > sum_min) and (temp < sum_max) then
// goto 1;
// dec(pint);
//
// temp := sk + pint^;
// if (temp > sum_min) and (temp < sum_max) then
// goto 1;
// dec(pint);
//
// temp := sk + pint^;
// if (temp > sum_min) and (temp < sum_max) then
// goto 1;
// dec(pint);
//
// temp := sk + pint^;
// if (temp > sum_min) and (temp < sum_max) then
// goto 1;
// dec(pint);
//
// temp := sk + pint^;
// if (temp > sum_min) and (temp < sum_max) then
// goto 1;
// dec(pint);
//
// temp := sk + pint^;
// if (temp > sum_min) and (temp < sum_max) then
// goto 1;
// dec(pint);
//
// temp := sk + pint^;
// if (temp > sum_min) and (temp < sum_max) then
// goto 1;
// dec(pint);
// end;
// {$ENDIF}
// end;
// end;
// end;
//
// Result := 0;
// Exit;
//
// 1:
// {$IFDEF pentium3}
// //the L is predicately, because we reference it in the inner loop body
// m := L;
// {$ELSE}
// //because the L value is inpredicatable in the inner loop(due to the optimization),
// //so we must calculate it iva the value of pint
// m := $ff - (integer(pointer(@precarr[0][$ff])) - integer(pint)) div sizeof(integer);
// {$ENDIF}
//
// //i,j,k is predicately
// result := (i shl 24) or (j shl 16) or (k shl 8) or m;
// end;
// procedure print_result(sum, mistake, r: integer; const nums: array of integer);
// var
// i, len, count, sum_verify: integer;
// begin
// len := Length(nums);
// count := 0;
// sum_verify := 0;
// for i := 0 to len - 1 do
// if (r and (1 shl i)) > 0 then
// begin
// Writeln('nums ', i:2, ': ', nums[i]:12);
// Inc(sum_verify, nums[i]);
// inc(count);
// end;
//
// Writeln('selection count: ', count);
// if abs(sum_verify - sum) >= mistake then
// begin
// WriteLn('*****************************');
// Writeln('ERROR');
// WriteLn('*****************************');
// end;
// end;
//
//
// type
// tdata = record
// sum: integer;
// mistake: integer;
// nums: array of integer;
// end;
//
//
// procedure make_data(const noresult: boolean; out data: tdata);
// {$IFNDEF num23}
// var
// i: integer;
// {$ENDIF}
// begin
// with data do
// begin
// sum := 743460984;
// mistake := 100;
//
// {$IFDEF num23}
// SetLength(nums, 23);
// {$ELSE}
// SetLength(nums, 32);
// {$ENDIF}
//
// nums[0] := 78539724;
// nums[1] := 11989508;
// nums[2] := 16924851;
// if noresult then
// nums[3] := 49903571
// else nums[3] := 49903471; //
// nums[4] := 11989508;
// nums[5] := 13619134;
// nums[6] := 6792615;
// nums[7] := 159109415; //
// nums[8] := 5457709; //
// nums[9] := 10030376;
// nums[10] := 9932054; //
// nums[11] := 20060777; //
// nums[12] := 12329881; //
// nums[13] := 23907053; //
// nums[14] := 20318391; //
// nums[15] := 20318391; //
// nums[16] := 55625016; //
// nums[17] := 64549082; //
// nums[18] := 37358893; //
// nums[19] := 39924860; //
// nums[20] := 38673204; //
// nums[21] := 45567426; //
// nums[22] := 140425361; //
//
// {$IFNDEF num23}
// Randomize();
// for I := 23 to 31 do
// nums[23] := RandomRange(10000000, 20000000);
// {$ENDIF}
// end;
// end;
//
//
// procedure test(noresult: boolean);
// var
// data: tdata;
// r: integer;
// li_freq, li_begin, li_end: TLargeInteger;
// msecs: double;
// begin
// if noresult then
// Writeln('case: noresult = true')
// else Writeln('case: noresult = false');
// make_data(noresult, data);
// if not QueryPerformanceFrequency(li_freq) then
// RaiseLastOSError();
// QueryPerformanceCounter(li_begin);
// r := p4_2(data.sum, data.mistake, data.nums, noresult);
// QueryPerformanceCounter(li_end);
//
// msecs := (li_end - li_begin) * 1000 / li_freq;
// Writeln('use time(milli seconds): ', FormatFloat('0.00', msecs));
// Writeln('selection=', inttohex(r, 8));
// if r <> 0 then
// print_result(data.sum, data.mistake, r, data.nums);
// Writeln('');
//
// if noresult and (r <> 0) then
// test(true);
// end;
//
// procedure pause();
// var
// s: string;
// begin
// Writeln('press ENTER to continue...');
// Readln(s);
// end;
//
//
//begin
// test(true);
// test(false);
// pause();
//end.
if sum=t then begin|. 8B45 F8 |MOV EAX,DWORD PTR SS:[EBP-8]
|. 8B80 5C040000 |MOV EAX,DWORD PTR DS:[EAX+45C] //取sum
|. 99 |CDQ //带符号将sum扩展成双字
|. 52 |PUSH EDX
|. 50 |PUSH EAX //将双字推到stack|. 8BC6 |MOV EAX,ESI //扩展t, 低位在eax
|. 33D2 |XOR EDX,EDX //高位在edx, 为0|. 3B5424 04 |CMP EDX,DWORD PTR SS:[ESP+4] //比较先前push的edx
|. 75 03 |JNZ SHORT TakeApar.004595B7
|. 3B0424 |CMP EAX,DWORD PTR SS:[ESP] //比较先前push的eax
|> 5A |POP EDX //平衡stack
|. 58 |POP EAX //平衡stack
|. 75 05 |JNZ SHORT TakeApar.004595C0 //如果同则....看起来应该是一个integer和cardinal,进行比较,很可能你的t是cardinal,而sum是integer,在这种情况下,编译是正确的后来,你先减sum然后在测试是否为0,代码就简化了,因为加减法实质上是无符号的,而测试0就一条指令
我再看看,谢谢。D2006的确能拷贝CPU VIEW了。D2005以前的还是不行。
改成:
procedure precalc();
var
bits: array[0..7] of byte;
b: byte;
i, j, temp: Integer;
pIntArr: PIntegerArray;
temp_nums: array[0..31] of Integer;
begin
for j := 0 to 7 do
bits[j] := 1 shl j; fillchar(temp_nums[0], sizeof(temp_nums), 0); for i := 0 to count-1 do
temp_nums[i] := nums[i]; pIntArr := @temp_nums[0];
for i := 0 to 3 do
begin
for b := low(b) to high(b) do
begin
temp := 0;
for j := 0 to 7 do
begin
if (b and bits[j]) > 0 then
inc(temp, pIntArr[j]);
end; precarr[i][b] := temp;
end; inc(PInteger(pIntArr), 8);
end;
end;
应该可以了另外,我刚才下班后又测试了另外一个思路,应该可以思路是:
把最内层的预计算排序一下
然后从预计算的结果中进行二分查找
//最内层的预计算结果
t_prec_result = class
value: integer; //计算的值
bitmask: byte; //L的值
end; t_scan_table = class
public
sum: integer;
list: tlist;
procedure sortASC(); //排序 function Scan(const aBaseValue: integer): t_prec_result; //二分查找
end;
precarr: array[0..3] of tprec8;
变成
precarr: array[0..2] of tprec8;
预计算变成
var
temp_nums: array[0..31] of integer;
pIntArr := @temp_nums[8]; //8..31, 放在precarr[0..2]
for i := 0 to 2 do
begin
for b := low(b) to high(b) do
begin
temp := 0;
for j := 0 to 7 do
begin
if (b and bits[j]) <> 0 then
inc(temp, pIntArr[j]);
end; precarr[i][b] := temp;
end; inc(PInteger(pIntArr), 8);
end; //0..7, 放在scan_table内
for b := low(b) to high(b) do
begin
temp := 0;
for j := 0 to 7 do
begin
if (b and bits[j]) <> 0 then
inc(temp, temp_nums[j]);
end; pr := t_prec_result.Create();
pr.value := temp;
pr.bitmask := b;
scan_table.list.Add(pr);
end;
scan_table.sortASC(); //按照pr.value进行升序排序
主要循环就变成
for i := tbits(bitmask).bytearray[3] downto 0 do
begin
si := precarr[2][i];
for j := tbits(bitmask).bytearray[2] downto 0 do
begin
sj := si + precarr[1][j];
for k := tbits(bitmask).bytearray[1] downto 0 do
begin
sk := sj + precarr[0][k];
prec_result := scan_table.scan(sk);
if prec_result <> nil then
begin
result := prec_result.bitmask or (i shl 24) or (j shl 16) or (k shl 8);
exit;
end;
end;
end;
end;Scan的代码, function t_scan_table.Scan(const aBaseValue: integer): t_prec_result;
var
i, min_index, max_index, temp: integer;
{$ifdef debug_and_trace}
next_index: integer;
{$endif}
ptrlist: PPointerList;
begin
max_index := list.count-1;
min_index := 0; ptrlist := list.List; i := max_index shr 1; while true do
begin
Result := t_prec_result(ptrlist[i]);
{$ifdef debug_and_trace}
_index_[i] := true;
inc(_count_);
{$endif} temp := (Result.value + aBaseValue); if (temp = sum) then
Exit //ok
else if temp > sum then
begin
if i = 0 then
Break; //exit max_index := i-1; {$ifdef debug_and_trace}
next_index := (min_index + max_index) shr 1;
Assert(next_index < i);
i := next_index;
{$else}
i := (min_index + max_index) shr 1;
{$endif}
if min_index > i then
Break; //exit
end
else //if (temp < sum) then
begin
if i = max_index then
Break; //exit min_index := i + 1; {$ifdef debug_and_trace}
next_index := (min_index + max_index+1) shr 1;
Assert(next_index > i);
i := next_index;
{$else}
i := (min_index + max_index + 1) shr 1;
{$endif}
end;
end; Result := nil;
end; 这样,比原来多了个排序开销,
但原来最内层的256次操作,通过二分查找法降低到最多8次,即使算上二分查找和调用的开销,
也应该低于原来的256次操作
我在这台P4 1.8G上测得32个数,无解, 1.3秒返回
本来想把最内两层的65536次操作,都通过二分查找法来作的,这样可以把操作从65536次降低到最多16次,
就远远快于原来的操作了,但是考虑到预先的排序过程确是比较慢的,65536个数排序最坏时复杂度
是$FFFFFFFF,这就得不偿失了
和我的相反有一些地方类似。
我也在考虑把拆数的大循环改称几个嵌套的循环,来减少4个数做加法消耗的时间,
我的想法:
1.上一个帖子有人提出过排序,然后计算出来至少和至多需要多少个数m和n。
2. 采用你的预计算方式,计算的同时统计与计算结果中每个结果中bit位1的个数;
3. 在循环的适当深度计算bit位上1的个数,如果大于n,或者即使下层循环全是1,1的个数也小于m,则往下循环的工作不用作了,直接Continue.这个可以理解为是剪枝的工作。
我没试过,不知道他增加开销和剪枝的效果是否值。不论如何,32个数无解的情况能降低到1.x秒,已经完全满足实际应用了。alphaX你看看这个想法可行不?
位计数和检查的开销为5时钟,并假设位计数和检查放在第3层(针对n >= 11 and n <= 13的情形,
对于n = 8的情况可以放在第2层,不过道理是一样的),即k循环层,
for i ...
for j ...
...
bit_count_j := bit_count_map[i] + bit_count_map[j];
for k ...
begin
...
bit_count_k := bit_count_j + bit_count_map[k];
if bit_count_k > n then
continue; scan(65536个结果中折二查找)
...
end //k
...增加的开销 > ($100 0000 * 5 = $500 0000)当n=12时,节省的开销 C(12, 24) * 300 = ($29 431C) * 300 = $305A A4D0
当n=11 or 13时, 节省 $2CA2 70C0原来的整体开销 ($100 0000 * 300) 加上排序后约为 $1 7000 0000
改进后为 $1 7000 0000 + $500 0000 - $3000 0000 = $1 4500 0000也就是,如果位计数和检查的开销为5的估计准的话,n=12时时间开销可能只能
降低到原来的88%,也就是1.13倍的提高,
而成本是必须为n>=11 and n<=13的情形维护另外一个程序版本。
但位计数和检查的开销为5的估计是非常乐观的,
k层内增加的操作有:访存一次,加法一次,减法一次,测试一次,这些操作
都假设寄存器不是很紧张,而且访存命中,但实际上在外3层循环里,
每层里都有计算,实际上很难有这么好的情形,
并且一直以来delphi的对循环体的寄存器分配是比较弱的,
增加位计数累加和检查,会明显增加delphi分配寄存器的压力,
上次你也看到了,就是你前面说的delphi怎么优化的那一段,
他为了腾出寄存器,竟然在循环的最内层push/pop,其实在外层腾出寄存器,
最内层就不必push/pop了,他却不知道他所产生的push/pop是G数量级的,
还有一个问题就是增加的判断:
if bit_count > n then
...
当bit_count > n成立时,就等于pentium的指令预取判断错误,执行就停顿,
必须重新取指,所以平均增加的开销很可能不是5,可能达到8甚至更多,
所以,在外层只有3层而最内层操作开销不是很大的情况下,
加插这个位数检查暂时可能不会有太大的益处,
如果外层有4层(48个数的情形),那么在第3层做这个处理是非常有必要的另外,我又改进了一下,原来考虑排序的算法是quick sort,
因为要确保程序在比较确定的时间内返回,所以没敢作16位排序,只做了8位的,
后来想到,可以在预计算时想办法确保结果值有序,然后用归并排序归并,
归并主体的复杂度约为 8n,即$8 0000次比较,这样就可以把最内2层,用二分查找来做了,我的最新测试,32位数无解情况下,p4 1.8,所花的时间0.02秒多一点
现已通过32位的正确性测试,现在作一下40位的测试并整理一下代码
简单的,撇开预计算和排序部分的开销,单就循环主体的开销来计算
主循环主体循环次数 = $100 0000如果在第三层进行处理, n = 11 时命中的次数(即bit_count > n)为 = C(12,24) + C(13,24) + ... + C(24,24)
不命中的次数为 = C(11,24) + C(10, 24) + ... + C(1,24)
= C(13,24) + C(14,24) + .. + C(24,24)命中-不命中=C(12,24)
c(12,24) = $29 431C需要执行的(不命中的)= ($100 0000 - C(12,24)) / 2 = $6B 5E72也就是循环主体的循环次数下降到原来的42% ($6b 5e72 / $100 0000)
第二层,n = 7 时命中的次数为 = C(8,16) + C(9,16) + ... + C(16,16)
不命中次数为 = C(7,16) + C(6,16) + ... + C(1,16)
= C(9,16) + ....
命中-不命中=C(8,16)
C(8,16) = 12870
不命中=($1 0000 - 12870) / 2 = 26333
循环次数下降到原来的40%
依此看来, 针对这些特定情形作带位计算处理的版本,还是能够得到2倍的性能提高的
我想了一下,>>3. 在循环的适当深度计算bit位上1的个数,如果大于n,
>>或者即使下层循环全是1,1的个数也小于m,
>>则往下循环的工作不用作了for i ...
for j ...
...
sum_i_j := prec_arr[2][i] + prec_arr[1][j];
for k ...
begin
...
sum_i_j_k := sum_i_j + prec_arr[0][k];对于n的判断,
实际上直接比较sum和sum_i_j_k就等价于判断i/j/k的位数数否大于n了,即
if sum_i_j_k > sum then
continue;而对于m的判断,可以预先计算nums[0..15]的和,得到sum_0_15,
将sum减去sum_0_15得到一个sum2然后同样在k层
if sum_i_j_k < sum2 then
continue;这样也等价于判断m,这样其实位计数和检查就简化了
现在40个数无解情况下竟然只要0.2秒不到,而32个数,24个数都要0.015,有点不解,不知道是否代码错了,但是测试又没发现问题
project文件
============================================
program Project15;{$APPTYPE CONSOLE}uses
SysUtils,
pack5 in 'pack5.pas';var
e: tobject;
begin
try
t_pack_tester.full_test();
except
e := ExceptObject();
if e is Exception then
writeln('ERROR: ', exception(e).Message)
else writeln('ERROR: (Unknown error)');
end;
end.
{.$define trace}
{$define pack5_test}{$ifdef pack5_test}
{$define perf_meas} //正式使用时,应关闭perf_meas
{$endif}
interface
uses
SysUtils, Classes, Math
{$ifdef pack5_test }
, Windows
{$endif}
;const
max_num_count = 40;
bits_per_byte = 8;
bits: array[0..bits_per_byte-1] of cardinal = (
1 shl 0,
1 shl 1,
1 shl 2,
1 shl 3,
1 shl 4,
1 shl 5,
1 shl 6,
1 shl 7
);
scan_not_found = $ffffffff;type
t_pack_selection = array of byte;type
t_prec_result = record
value: cardinal;
bitmap: cardinal;
end; //8 bytes
p_prec_result = ^t_prec_result; t_prec_result_list = array[0..($100 * $100-1)] of t_prec_result;
p_prec_result_list = ^t_prec_result_list; t_prec_result_list256 = array[low(byte)..high(byte)] of t_prec_result;
p_prec_result_list256 = ^t_prec_result_list256; t_8nums = array[0..bits_per_byte-1] of cardinal;
p_8nums = ^t_8nums; t_prec_8 = array[byte] of cardinal;
p_prec_8 = ^t_prec_8;
t_bits = array[0..3] of byte;
p_bits = ^t_bits;{$ifdef pack5_test}
t_data_sample = class
public
no_result: boolean;
init_selection: t_pack_selection;
sum: cardinal;
num_count: cardinal;
min_selection_count: integer;
nums: array of cardinal;
perf_cnt_begin, perf_cnt_end: tlargeinteger; constructor create(
const a_no_result: boolean;
const a_num_count: cardinal;
const a_min_selection_count: integer
); procedure exec(const print_found_selection: boolean);
function perf_cnt_period_to_ms(): double; function verify(const selection: t_pack_selection; const print_nums: boolean): boolean;
end;
t_pack_tester = class
public
class procedure full_test;
end;
{$endif pack5_test} t_pack_searcher = class
private (*** common private part ***)
num_count: integer;
sum: cardinal;
no_result: boolean;
nums_copy: array[0..39] of cardinal;
require_order_map: boolean; //for middle or large case
prec_result_list: t_prec_result_list;
prec_result_list_sz: cardinal; //指出原来的顺序,取nums_order_map[i],可以得到当前num_copy中第i个数的原来的序号
//仅当require_order_map时有意义
nums_order_map: array[0..39] of cardinal; prec_8_arr: array[0..2] of t_prec_8;
selection_sz: integer; procedure sort_num_copies(); {$ifdef debug}
procedure check_bitmap(
const nums: array of cardinal;
const bitmap: cardinal;
const value: cardinal);
procedure check_prec_result_bitmap(
const nums: array of cardinal;
p: p_prec_result;
count: integer
);
{$endif} procedure map_to_original_order(var selection: t_pack_selection); procedure make_empty_pack_selection(var selection: t_pack_selection); {
pre-calculate 8 nums all posible sum of 8 nums combination
nums_copy_index: the first num index of nums_copy
prec_8_arr_index: the result sum will put in prec_8_arr[prec_8_arr_index][]
}
procedure prepare_prec_8_arr(
const nums_copy_index: integer;
const prec_8_arr_index: integer
); procedure prepare_dummy_prec_8_arr(const prec_8_arr_index: integer); private (*** private for small case ***)
//for the case num_count < 16
function internal_exec_smallcase(): t_pack_selection; private (*** private for middle case ***)
//pre-calcuate the results for all posible nums[0]..nums[8] combination
//sort and store in prec_result_list
procedure prepare_middle;
//for the case num_count >= 16
function internal_exec_ge16(): t_pack_selection; private (*** private for large case ***)
//pre-calcuate the results for all posible nums[0]..nums[15] combination
//sort and store in prec_result_list
procedure prepare_large; //return value: low 16 bit selection
//return $ffffffff for not found
function scan2(const base_value: cardinal): cardinal; //for the case num_count >= 24
function internal_exec_ge24: t_pack_selection; public
constructor create(
const a_no_result: boolean;
const a_sum: cardinal;
const a_nums: array of cardinal
); class procedure test_scan_proc; function exec(): t_pack_selection;
end;function is_empty_pack_selection(const selection: t_pack_selection): boolean;
function calc_selection_size(const num_count: integer): integer;
procedure print_selection(const selection: t_pack_selection);function test_selection(
const selection: t_pack_selection;
const index: integer): boolean;
implementationprocedure pause();
var
s: string;
begin
Writeln('press ENTER to continue...');
Readln(s);
end;{$ifdef debug}
//print nums, and return the sum
function nums_sum(
const nums: array of cardinal;
const bitmap: cardinal;
const print_nums: boolean): cardinal;
var
i, max: integer;
begin
result := 0;
max := high(nums);
if max > 31 then
max := 31;
for i := low(nums) to max do
if bitmap and (1 shl i) <> 0 then
begin
inc(result, nums[i]);
if print_nums then
writeln('has_num(', i:2, '): ', nums[i]:12);
end;
end;procedure touch(var x);
begin
//do nothing
end;
{$endif}{$ifdef pack5_test}
procedure indent(spc_cnt: integer);
var
s: string;
begin
if spc_cnt > 0 then
begin
setlength(s, spc_cnt);
fillchar(s[1], spc_cnt, ' ');
write(s);
end;
end;
{$endif}function is_empty_pack_selection(
const selection: t_pack_selection
): boolean;
var
i: integer;
begin
result := false;
for i := low(selection) to high(selection) do
if selection[i] <> 0 then
exit; result := true;
end;
function calc_selection_size(const num_count: integer): integer;
begin
result := (num_count + (bits_per_byte-1)) div bits_per_byte;
end;function test_bit(const p: pointer; const index: integer): boolean;
var
byte_index, bit_index: integer;
begin
byte_index := index div 8;
bit_index := index mod 8; result := (pbytearray(p)[byte_index] and bits[bit_index]) <> 0;
end;function test_selection(
const selection: t_pack_selection;
const index: integer): boolean;
begin
{$ifdef debug}
assert(length(selection) > 0);
{$endif} result := test_bit(@selection[0], index);
end;
{$ifdef trace}
var
merge_move_count: integer;
scan_hit_count: integer;
{$endif}
type
t_cardinal_arr = array of cardinal;
function copy_nums(
const nums: array of cardinal;
const copy_start_index: integer;
const count: integer): t_cardinal_arr;
begin
assert(copy_start_index >= 0);
assert((copy_start_index + count) <= length(nums));
setlength(result, count);
if count > 0 then
move(nums[copy_start_index], result[0], count * sizeof(result[0]));
end;
{$endif}{ Copy from Borland.Classes.pas/implementation }
procedure QuickSort(SortList: PPointerList; L, R: Integer;
SCompare: TListSortCompare);
var
I, J: Integer;
P, T: Pointer;
begin
repeat
I := L;
J := R;
P := SortList^[(L + R) shr 1];
repeat
while SCompare(SortList^[I], P) < 0 do
Inc(I);
while SCompare(SortList^[J], P) > 0 do
Dec(J);
if I <= J then
begin
T := SortList^[I];
SortList^[I] := SortList^[J];
SortList^[J] := T;
Inc(I);
Dec(J);
end;
until I > J;
if L < J then
QuickSort(SortList, L, J, SCompare);
L := I;
until I >= R;
end;procedure set_bit(p: pointer; bit_index: integer);
var
byte_index: integer;
begin
byte_index := bit_index div 8;
bit_index := bit_index mod 8;
pbytearray(p)[byte_index] := pbytearray(p)[byte_index] or bits[bit_index];
end;
function nums_copy_sort_compare_asc(a, b: pointer): integer;
begin
if cardinal(a) = cardinal(b) then
result := 0
else if cardinal(a) > cardinal(b) then
result := 1
else result := -1;
end;function prec_result_sort_compare_asc(a, b: pointer): integer;
var
x, y: p_prec_result;
begin
x := a; y := b;
if x^.value = y^.value then
result := 0
else if x^.value > y^.value then
result := 1
else result := -1;
end;
procedure calc_prec_result_list256(
in_8nums_ptr: p_8nums;
out_prec_result_temp: p_prec_result_list256);
{
预计算8个数的各种可能的组合的和,并按和的大小升序排序
}
var
i, j: integer;
temp: cardinal;
p: p_prec_result;
ptr_array_ff: array[low(byte)..high(byte)] of p_prec_result;
buff: t_prec_result_list256;
begin
//预计算8个数的各种可能的组合的和
for i := low(byte) to high(byte) do
begin
temp := 0;
for j := 0 to bits_per_byte-1 do
if (i and bits[j]) <> 0 then
temp := temp + in_8nums_ptr[j]; p := @buff[i];
p^.value := temp;
p^.bitmap := i; ptr_array_ff[i] := p;
end;
//排序
quicksort(
@ptr_array_ff,
low(ptr_array_ff),
high(ptr_array_ff),
prec_result_sort_compare_asc
);
//now data in buff is sorted, copy to out_prec_result_temp
for i := low(buff) to high(buff) do
out_prec_result_temp[i] := ptr_array_ff[i]^;
end;procedure merge(x, y, result: p_prec_result; count: integer);
var
x_cnt, y_cnt: integer;
begin
x_cnt := count;
y_cnt := count; repeat
if x^.value <= y^.value then
begin
//put x
result^ := x^;
inc(x);
dec(x_cnt);
end
else
begin
//put y
result^ := y^;
inc(y);
dec(y_cnt);
end; inc(result);
until (x_cnt = 0) or (y_cnt = 0); if (x_cnt > 0) then
move(x^, result^, sizeof(x^) * x_cnt)
else if (y_cnt > 0) then
move(y^, result^, sizeof(y^) * y_cnt); {$if defined(trace)}
inc(merge_move_count, x_cnt + y_cnt);
{$ifend}
end;{$ifdef debug}
procedure verify_order_le(first_prec_result: p_prec_result; count: integer);
var
value: cardinal;
begin
value := 0;
while count > 0 do
begin
if value > first_prec_result^.value then
begin
asm int 3 end;
touch(first_prec_result); touch(count);
writeln('verify_order_le failed');
pause();
end; value := first_prec_result^.value;
inc(first_prec_result);
dec(count);
end;
end;
{$endif}
{$ifdef debug}
procedure t_pack_searcher.check_bitmap(
const nums: array of cardinal;
const bitmap, value: cardinal);
var
s: cardinal;
begin
s := nums_sum(nums, bitmap, false);
if s <> value then
begin
nums_sum(nums, bitmap, true);
asm
int 3
end;
writeln('check_bitmap failed, sum is ', s, ', expected is ', value);
pause();
end;
end;procedure t_pack_searcher.check_prec_result_bitmap(
const nums: array of cardinal;
p: p_prec_result;
count: integer);
begin
while count > 0 do
begin
check_bitmap(nums, p^.bitmap, p^.value);
dec(count);
inc(p);
end;
end;
{$endif debug}constructor t_pack_searcher.create(
const a_no_result: boolean;
const a_sum: cardinal;
const a_nums: array of cardinal);
var
i: integer;
sum_64: int64;
begin
sum := a_sum;
assert(not a_no_result or (sum > 0)); num_count := length(a_nums); assert(
(num_count > 0)
and
(num_count <= max_num_count)
); no_result := a_no_result; sum_64 := 0;
for i := low(a_nums) to high(a_nums) do
inc(sum_64, a_nums[i]); if sum_64 > (high(sum)-1) then //high(cardinal) is reserved for no_result test
raise exception.create('Invalid parameter, sum of nums is too large'); selection_sz := calc_selection_size(num_count); move(a_nums[0], nums_copy[0], length(nums_copy) * sizeof(a_nums[0]));
end;
function t_pack_searcher.exec(): t_pack_selection;
begin {$ifdef trace}
scan_hit_count := 0;
merge_move_count := 0;
{$endif}
if num_count >= 24 then
result := internal_exec_ge24()
else if num_count >= 16 then
result := internal_exec_ge16()
else
result := internal_exec_smallcase(); {$ifdef trace}
writeln('scan hit count: ', scan_hit_count);
writeln('merge_move_count: ', merge_move_count);
{$endif}
end;//for the case num_count >= 16
function t_pack_searcher.internal_exec_ge16: t_pack_selection;
var
i, j: integer;
bits: integer;
temp, value_i: cardinal;
scan_result, sum_0_7: cardinal; procedure make_result(use_scan_result: boolean);
begin
setlength(result, selection_sz);
if use_scan_result then
result[0] := scan_result and $00ff
else
Result[0] := 0; result[1] := j;
if selection_sz > 2 then
result[2] := i; //map_to_original_order(result);
end;begin
prepare_middle(); prepare_prec_8_arr( 8, 0);
if num_count > 16 then
prepare_prec_8_arr( 16, 1)
else prepare_dummy_prec_8_arr(1); bits := 1 shl (num_count - 8) - 1;
{ reserved 8 bits is processed by scan() procedure }
scan_result := scan_not_found; {$ifndef perf_meas}
sum_0_7 := 0;
for i := 0 to 7 do
sum_0_7 := sum_0_7 + nums_copy[i];
{$else}
//因为no_result时产生的sum为high(cardinal)-1
//所以下面的if temp > sum_0_7 then
//总是成立,这将影响性能测试的准确性
//所以应该设为
sum_0_7 := high(cardinal);
{$endif}
for i := t_bits(bits)[1] downto 0 do
begin
value_i := prec_8_arr[1][i];
for j := t_bits(bits)[0] downto 0 do
begin
temp := sum - (value_i + prec_8_arr[0][j]);
//即使 integer(temp) < 0
//也有 temp <= sum_0_7
if temp > sum_0_7 then
Continue;
if (temp = 0) then
begin
make_result(false);
exit;
end
else if (integer(temp) < 0) then
continue;
scan_result := scan2(temp);
if scan_result <> scan_not_found then
begin
make_result(true);
exit;
end;
end;
end; make_empty_pack_selection(result);
end;
function t_pack_searcher.internal_exec_ge24: t_pack_selection;
var
i, j, k: integer;
bits: integer;
temp, value_i, value_i_j: cardinal;
scan_result, sum_0_15: cardinal; procedure make_result(use_scan_result: boolean);
begin
setlength(result, selection_sz);
if use_scan_result then
begin
result[0] := scan_result and $00ff;
result[1] := (scan_result and $ff00) shr 8;
end
else
begin
result[0] := 0;
result[1] := 0;
end;
result[2] := k;
if selection_sz > 3 then
begin
result[3] := j;
if selection_sz > 4 then
result[4] := i;
end; map_to_original_order(result);
end;begin
//pre-calculate sums of 0..15
prepare_large(); //prec-calculate sums of 16..39
prepare_prec_8_arr( 16, 0); if num_count > 24 then
prepare_prec_8_arr( 24, 1)
else prepare_dummy_prec_8_arr(1); if num_count > 32 then
prepare_prec_8_arr( 32, 2)
else prepare_dummy_prec_8_arr(2); bits := 1 shl (num_count - 16) - 1;
{ reserved 16 bits is processed by scan() procedure } {$ifndef perf_meas}
sum_0_15 := 0;
for i := 0 to 15 do
sum_0_15 := sum_0_15 + nums_copy[i];
{$else}
//因为no_result时产生的sum为high(cardinal)-1
//所以下面的if temp > sum_0_15 then
//总是成立,这将影响性能测试的准确性
//所以应该设为
sum_0_15 := high(cardinal);
{$endif}
scan_result := scan_not_found; for i := t_bits(bits)[2] downto 0 do
begin
value_i := prec_8_arr[2][i];
for j := t_bits(bits)[1] downto 0 do
begin
value_i_j := value_i + prec_8_arr[1][j];
for k := t_bits(bits)[0] downto 0 do
begin
temp := sum - (value_i_j + prec_8_arr[0][k]); //即使 integer(temp) < 0
//也有 temp <= sum_0_15
if temp > sum_0_15 then
continue; if (temp = 0) then
begin
make_result(false);
exit;
end
else if integer(temp) < 0 then
continue; scan_result := scan2(temp);
if scan_result <> scan_not_found then
begin
make_result(true);
exit;
end;
end; // for k
end; //for j
end; //for i
make_empty_pack_selection(result);
end;//for the case num_count < 16
function t_pack_searcher.internal_exec_smallcase: t_pack_selection;
var
i, j: integer;
bits: integer;
value_i: cardinal;
begin
prepare_prec_8_arr(0, 0); if num_count > 8 then
prepare_prec_8_arr(8, 1)
else prepare_dummy_prec_8_arr(1); bits := (1 shl num_count) - 1; for i := t_bits(bits)[1] downto 0 do
begin
value_i := prec_8_arr[1][i];
for j := t_bits(bits)[0] downto 0 do
begin
if (value_i + prec_8_arr[0][j]) = sum then
begin
setlength(result, selection_sz);
result[0] := j;
if selection_sz > 1 then
result[1] := i;
exit;
end;
end;
end; make_empty_pack_selection(result);
end;procedure t_pack_searcher.make_empty_pack_selection(
var selection: t_pack_selection);
begin
if length(selection) <> selection_sz then
setlength(selection, selection_sz); fillchar(
selection[0],
sizeof(selection[0]) * selection_sz,
0
);
end;procedure t_pack_searcher.map_to_original_order(
var selection: t_pack_selection);
var
b: byte;
i, j, bit_index_base, org_index: integer;
temp: t_pack_selection;
begin
if not require_order_map then
exit; setlength(temp, length(selection)); for i := low(selection) to high(selection) do
begin
b := selection[i];
bit_index_base := i * 8;
for j := 0 to bits_per_byte - 1 do
begin
if (b and bits[j]) <> 0 then
begin
org_index := nums_order_map[bit_index_base + j];
set_bit(@temp[0], org_index);
end;
end;
end;
move(
temp[0],
selection[0],
sizeof(temp[0]) * length(temp)
);
end;procedure t_pack_searcher.prepare_dummy_prec_8_arr(
const prec_8_arr_index: integer);
begin
fillchar(
prec_8_arr[prec_8_arr_index],
sizeof(prec_8_arr[prec_8_arr_index]),
0
);
end;
var
prec_results_temp_lo: t_prec_result_list256; //size=$100 * 8
prec_results_temp_hi: t_prec_result_list256; //size=$100 * 8;
//required stack size is $1 000 i, j: cardinal;
iter_lo, iter_hi: integer;
p_lo, p_hi, p_dest: p_prec_result; prec_result_list_temp,
prec_result_list_src,
prec_result_list_dst,
swap: p_prec_result_list; dest_is_temp: boolean;
begin
//对num_copy进行排序
sort_num_copies();
prec_result_list_sz := length(prec_result_list); //现在, nums[0] <= nums[1] <= nums[2] ... <= nums[15]
//所以我们接下来的calc_prec_results_temp()调用的性能应该是很好的 //对 nums[0.. 7] 产生所有的组合的 sum 到 prec_results_temp_lo,
//对 nums[8..15] 产生所有的组合的 sum 到 prec_results_temp_hi,
//并对两个结果数组进行排序
//开销 = $100 * log($100) * 2
calc_prec_result_list256(@nums_copy[0], @prec_results_temp_lo); //quick sort
{$ifdef debug}
check_prec_result_bitmap(
nums_copy,
@prec_results_temp_lo[0],
length(prec_results_temp_lo)
);
{$endif}
calc_prec_result_list256(@nums_copy[8], @prec_results_temp_hi); //quick sort
{$ifdef debug}
check_prec_result_bitmap(
copy_nums(nums_copy, 8, 8),
@prec_results_temp_hi[0],
length(prec_results_temp_hi)
);
{$endif} //至此,有
//with prec_results_temp_lo:
// [0].value <= [1].value <= [2].value ... <= [255].value
//对prec_results_temp_hi亦有:
// [0].value <= [1].value <= [2].value ... <= [255].value getmem(prec_result_list_temp, sizeof(prec_result_list_temp^));
try
//接下来,我们再从这两个256个可能的值中组合产生65536个16位值
p_dest := @prec_result_list[0]; //结果放在这个大数组中 (*****************************************
* TODO:
* use xmm to accelerate the addition
* operation
*****************************************)
for iter_hi := low(prec_results_temp_hi) to high(prec_results_temp_hi) do
begin
p_hi := @prec_results_temp_hi[iter_hi]; for iter_lo := low(prec_results_temp_lo) to high(prec_results_temp_lo) do
begin
p_lo := @prec_results_temp_lo[iter_lo]; p_dest^.value := p_hi^.value + p_lo^.value;
p_dest^.bitmap := (p_hi^.bitmap shl 8) + p_lo^.bitmap; {$ifdef debug}
check_bitmap(nums_copy, p_dest^.bitmap, p_dest^.value);
{$endif} inc(p_dest);
end;
end; //因为前面的prec_results_temp_lo和prec_results_temp_hi都是已经有序的
//所以
// for i := 0 to 255-1,恒有
// prec_result_list[i] <= prec_result_list[i+1]
//
// for i := 256 to 511-1, 有
// prec_result_list[i] <= prec_result_list[i+1]
//
//依次类推,整个prec_result_list以256个记录为组分组,都有这些规律
//
//那么,可以在这个基础上进行归并排序,每相邻的两个分组作一次归并,得到一个更大
//的512数组,再归并,直到最后,两个32768个记录的数组归并成一个65536个记录
//的数组,排序完毕
//也可以对折归并,对折时,左边的数据比右边数据小的情况比较多,merge过程的比
//较次数就会相对减少,所以对折归并比较有利 j := $200; //最开始,每次合并的覆盖面为两个$100
prec_result_list_src := @prec_result_list;
prec_result_list_dst := prec_result_list_temp; //第1次: merge的结果在prec_result_list_temp dest_is_temp := true; {
对折归并
以下j循环的总共开销为
8 * $10 000 = $80 000 次 (比较+存储)
+
8 * 128 = 1024 次 (函数调用)
}
while j <= $10000 do //合并的覆盖面小于等于$10000, 总共循环8次
begin
{
以下i循环开销为
($10000 div 2 div (j div 2)) * (j/2 + j/2) = $10000
}
i := 0;
while i < $10000 div 2 do
begin
merge(
@prec_result_list_src^[i],
@prec_result_list_src^[i+cardinal($10000 div 2)],
@prec_result_list_dst^[i*2],
j div 2
);
inc(i, j div 2);
end;// //毗邻的数组两两归并
// i := 0;
// while i < $10000 do
// begin
// merge(
// @prec_result_list_src^[i],
// @prec_result_list_src^[i+j div 2],
// @prec_result_list_dst^[i],
// j div 2
// );
// inc(i, j);
// end; j := j * 2; //覆盖面增大1倍 //交换源/目标数组
swap := prec_result_list_src;
prec_result_list_src := prec_result_list_dst;
prec_result_list_dst := swap;
if j <= $10000 then
dest_is_temp := not dest_is_temp;
end; if dest_is_temp then
move(
prec_result_list_temp^,
prec_result_list,
sizeof(prec_result_list)
);
//第1次: merge的结果在: in prec_result_list_temp
//2nd: prec_result_list
//3: prec_result_list_temp
//4: prec_result_list
//5: prec_result_list_temp
//6: prec_result_list
//7: prec_result_list_temp
//8: prec_result_list <<最后一次 //所以,至此需要的结果已经准备好在prec_result_list了 //释放prec_result_list_temp
finally
freemem(prec_result_list_temp);
end; {$ifdef debug}
//验证结果数组的次序性
verify_order_le(@prec_result_list[0], length(prec_result_list));
//验证结果数组的位组合是否与他的和相对应
check_prec_result_bitmap(nums_copy, @prec_result_list[0], length(prec_result_list));
{$endif}
end;procedure t_pack_searcher.prepare_middle;
begin
prec_result_list_sz := $100; calc_prec_result_list256(@nums_copy, @prec_result_list);
end;{
pre-calculate 8 nums all posible sum of 8 nums combination
nums_copy_index: the first num index of nums_copy
prec_8_arr_index: the result sum will put in prec_8_arr[prec_8_arr_index][]
}
procedure t_pack_searcher.prepare_prec_8_arr(
const nums_copy_index: integer;
const prec_8_arr_index: integer
);
var
i, j: integer;
temp: cardinal;
begin
for i := low(byte) to high(byte) do
begin
temp := 0;
for j := 0 to bits_per_byte-1 do
if (i and bits[j]) <> 0 then
inc(temp, nums_copy[nums_copy_index + j]);
prec_8_arr[prec_8_arr_index][i] := temp;
end;
end;
//二分查找
function t_pack_searcher.scan2(const base_value: cardinal): cardinal;
var
i, min_index, max_index: integer;
temp: cardinal;
begin
min_index := 0;
max_index := prec_result_list_sz - 1; i := max_index shr 1;
while true do
begin
temp := prec_result_list[i].value - base_value;
{$ifdef trace}
inc(scan_hit_count, 1);
{$endif} if temp = 0 then
begin
result := prec_result_list[i].bitmap;
exit;
end
else if integer(temp) < 0 then
begin
if i = max_index then
break; min_index := i + 1;
i := ((i + 1) + max_index) shr 1;
end
else // if temp > 0 then
begin
if i = min_index then
break; max_index := i - 1;
i := (min_index + (i - 1)) shr 1;
end;
end; result := scan_not_found; //not found
end;{
对nums_copy进行排序,并且记录他们原来的次序
}
procedure t_pack_searcher.sort_num_copies;
var
i: integer;
sort_temp: array[0..39] of t_prec_result;
pp: array[0..39] of p_prec_result;
begin
for i := 0 to num_count-1 do
begin
sort_temp[i].value := nums_copy[i];
sort_temp[i].bitmap := i;
pp[i] := @sort_temp[i];
end; quicksort(
pointer(@pp),
0,
num_count-1,
prec_result_sort_compare_asc
); for i := 0 to num_count-1 do
begin
nums_copy[i] := pp[i]^.value;
nums_order_map[i] := pp[i]^.bitmap;
end;
require_order_map := true;
end;class procedure t_pack_searcher.test_scan_proc;
var
i: integer;
obj: t_pack_searcher;
begin
obj := t_pack_searcher(t_pack_searcher.newinstance());
try
for i := low(obj.prec_result_list) to high(obj.prec_result_list) do
begin
obj.prec_result_list[i].value := i;
obj.prec_result_list[i].bitmap := i;
end; obj.prec_result_list_sz := length(obj.prec_result_list); for i := low(obj.prec_result_list) to high(obj.prec_result_list) do
begin
obj.sum := i;
if obj.scan2(i) = scan_not_found then
raise exception.create('error');
end;
finally
obj.destroy();
end;end;
procedure print_byte_bitmap(const b: byte);
var
i: integer;
begin
for i := 0 to bits_per_byte-1 do
begin
if (b and bits[i]) <> 0 then
write('1')
else write('0');
end; write(' ');
end; procedure print_selection(const selection: t_pack_selection);
var
i: integer;
begin
for i := low(selection) to high(selection) do
print_byte_bitmap(selection[i]);
end;
t_bitmap_64 = record
lo, hi: cardinal;
end;
{ t_data_sample }
constructor t_data_sample.create(
const a_no_result: boolean;
const a_num_count: cardinal;
const a_min_selection_count: integer);var
i, rr_lo, rr_hi, bit_count, selection_sz: integer;
bitmask_lo, bitmask_hi: cardinal;
bitmap: t_bitmap_64;
sum64, max_sum: int64;
begin
assert((a_num_count > 0) and (a_num_count <= 40)); no_result := a_no_result;
num_count := a_num_count;
min_selection_count := a_min_selection_count; randomize(); setlength(nums, a_num_count); //the allowed sum is high(cardinal)-1
rr_hi := maxint div integer(a_num_count);
rr_lo := rr_hi div 4; for i := low(nums) to high(nums) do
nums[i] := randomrange(rr_lo, rr_hi); bitmap.lo := 0;
bitmap.hi := 0; if not no_result then
begin
max_sum := high(cardinal);
max_sum := max_sum - 1; if num_count >= 32 then
bitmask_lo := (not cardinal(0))
else
bitmask_lo := (not cardinal(0)) shr (32 - num_count); if num_count > 32 then
bitmask_hi := (not cardinal(0)) shr (8 - (num_count-32)) and $ff
else bitmask_hi := 0; repeat
if min_selection_count < 31 then
bitmap.lo := bitmap.lo or randomrange(1, maxint) and bitmask_lo
else bitmap.lo := bitmask_lo; if min_selection_count > 32 then
begin
if min_selection_count < 40 then
bitmap.hi := bitmap.hi or randomrange(1, maxint) and bitmask_hi
else bitmap.hi := bitmask_hi;
end; sum64 := 0;
bit_count := 0;
for i := low(nums) to high(nums) do
begin
if test_bit(@bitmap, i) then
begin
inc(sum64, nums[i]);
inc(bit_count);
end;
end;
until ((sum64 < max_sum) and (bit_count >= min_selection_count)); sum := sum64;
end
else
//no result case
sum := high(cardinal); selection_sz := calc_selection_size(num_count);
setlength(init_selection, selection_sz);
for i := low(nums) to high(nums) do
if test_bit(@bitmap, i) then
set_bit(@init_selection[0], i);
end;
procedure t_data_sample.exec(const print_found_selection: boolean);
var
searcher: t_pack_searcher;
selection: t_pack_selection;
begin
QueryPerformanceCounter(perf_cnt_begin);
searcher := t_pack_searcher.create(no_result, sum, nums);
try
selection := searcher.exec();
finally
searcher.destroy();
end;
QueryPerformanceCounter(perf_cnt_end); if verify(selection, false) and print_found_selection then
begin
write('found selection: '); print_selection(selection); writeln('');
end;
end;function t_data_sample.perf_cnt_period_to_ms: double;
var
freq: tlargeinteger;
begin
QueryPerformanceFrequency(freq);
result := (perf_cnt_end - perf_cnt_begin) * 1000;
result := result / freq;
end;function t_data_sample.verify(
const selection: t_pack_selection;
const print_nums: boolean): boolean;
var
i, selected_count: integer;
sum_verify: cardinal; procedure print_error();
begin
writeln('******************************');
writeln('ERROR: ');
writeln('******************************');
end;begin
if is_empty_pack_selection(selection) then
begin
if no_result then
begin
result := true;
exit;
end
else
begin
writeln('original sum is :', sum);
write('original selection is: '); print_selection(init_selection); writeln('');
print_error();
pause();
abort();
end;
end; sum_verify := 0;
selected_count := 0;
for i := low(nums) to high(nums) do
begin
if test_selection(selection, i) then
begin
inc(sum_verify, nums[i]);
inc(selected_count); if print_nums then
writeln('select ', i:2, 'th num: ', nums[i]:12);
end;
end;
if print_nums then
writeln('selected count: ', selected_count); result := sum_verify = sum;
if not result then
begin
print_error();
writeln('the selected sum is :', sum_verify);
writeln('original sum is :', sum);
write('original selection is: '); print_selection(init_selection); writeln('');
write('found selection is: '); print_selection(selection); writeln('');
pause();
abort();
end;
end;
t_perf_count_collection = record
expected_times, tested_times: integer;
used_times: array of double;
end;
p_perf_count_collection = ^t_perf_count_collection;{ t_pack_tester }
class procedure t_pack_tester.full_test;
procedure _exec_(
const a_no_result: boolean;
const a_num_count: cardinal;
const a_min_selection_count: integer;
perf_cnt_collect: p_perf_count_collection;
const a_indent: integer
);
var
sample: t_data_sample;
begin
sample := t_data_sample.create(
a_no_result,
a_num_count,
a_min_selection_count
);
try
if a_indent > 0 then
indent(a_indent);
sample.exec(perf_cnt_collect <> nil); if perf_cnt_collect <> nil then
begin
if perf_cnt_collect^.tested_times < perf_cnt_collect^.expected_times then
begin
perf_cnt_collect^.used_times[perf_cnt_collect^.tested_times] :=
sample.perf_cnt_period_to_ms(); inc(perf_cnt_collect^.tested_times);
end;
end;
finally
sample.destroy();
end;
end; procedure do_test(
const no_result: boolean;
const num_count: cardinal;
perf_cnt_collect: p_perf_count_collection;
const indent: integer
);
begin
if no_result then
_exec_(true, num_count, 0, perf_cnt_collect, indent)
else
begin
if perf_cnt_collect = nil then
begin
_exec_(false, num_count, 1, perf_cnt_collect, indent);
if num_count > 1 then
_exec_(false, num_count, num_count, nil, indent);
if num_count > 2 then
_exec_(false, num_count, num_count-1, nil, indent);
end; if num_count > 5 then
_exec_(false, num_count, num_count div 2, perf_cnt_collect, indent); if num_count > 7 then
begin
_exec_(false, num_count, num_count div 2 - 1, perf_cnt_collect, indent);
_exec_(false, num_count, num_count div 2 + 1, perf_cnt_collect, indent);
end; if num_count > 9 then
begin
_exec_(false, num_count, num_count div 2 - 2, perf_cnt_collect, indent);
_exec_(false, num_count, num_count div 2 + 2, perf_cnt_collect, indent);
end;
end;
end; procedure init_pcc(const expected_times: integer; var pcc: t_perf_count_collection);
begin
pcc.expected_times := expected_times;
pcc.tested_times := 0;
if length(pcc.used_times) <> expected_times then
setlength(pcc.used_times, expected_times);
end; procedure avg_of_pcc(
const pcc: t_perf_count_collection;
var max, min, avg: double);
var
i: integer;
t: double;
begin
max := 0;
min := MaxInt;
avg := 0;
for i := 0 to pcc.tested_times - 1 do
begin
t := pcc.used_times[i];
avg := avg + t;
if t > max then
max := t;
if t < min then
min := t;
end; avg := avg / pcc.tested_times;
end; function bool2str(b: boolean): string;
const b_str: array[boolean] of string = ('false', 'true');
begin
result := b_str[b];
end; procedure print_task_title(const title: string);
begin
writeln('######################');
writeln('TASK: ', title);
writeln('######################');
end; procedure test_correctness();
var
i: integer; begin
print_task_title('test correctness');
for i := 1 to max_num_count do
begin
indent(2); write('num count: ', i:2, ', no result = ', bool2str(true));
do_test(true, i, nil, 0);
do_test(true, i, nil, 0);
do_test(true, i, nil, 0);
do_test(true, i, nil, 0);
writeln(' OK'); indent(2); write('num count: ', i:2, ', no result = ', bool2str(false));
do_test(false, i, nil, 0);
do_test(false, i, nil, 0);
do_test(false, i, nil, 0);
do_test(false, i, nil, 0);
writeln(' OK');
end;
writeln('');
end; procedure test_performance(); procedure _p_(
const no_result: boolean;
const num_count: integer);
const
times = 4;
var
i: integer;
pcc: t_perf_count_collection;
max, min, avg: double;
begin
init_pcc(times * 32, pcc); indent(4); writeln('no result = ', bool2str(no_result)); for i := 1 to times do
begin
do_test(no_result, num_count, @pcc, 6);
if no_result then
do_test(no_result, num_count, @pcc, 6);
end; avg_of_pcc(pcc, max, min, avg); indent(6); writeln('tested ', pcc.tested_times, ' pass(es)');
indent(6); writeln('max used time(ms):', formatfloat('0.0000', max));
indent(6); writeln('min used time(ms):', formatfloat('0.0000', min));
indent(6); writeln('avg used time(ms):', formatfloat('0.0000', avg));
writeln('');
end; procedure p(const num_count: integer);
begin
indent(2); writeln('num count = ', num_count:2);
indent(2); writeln('---------------------');
_p_(true, num_count);
_p_(false, num_count);
writeln('');
end; var
i: integer;
begin
print_task_title('test performance');
i := 8;
while i <= max_num_count do
begin
p(i-1);
p(i);
inc(i, 8);
end;
end;
begin
writeln('+=======================================+');
writeln(' pack FULL TEST');
writeln('+=======================================+'); t_pack_searcher.test_scan_proc(); test_correctness();
{$ifdef perf_meas}
test_performance();
{$endif} writeln('*****************************************');
writeln('');
writeln('+=======================================+');
writeln(' FINISH');
writeln('+=======================================+');
writeln('');
writeln('');
pause();
end;
{$endif}end.
时间复杂度基本上从 O(n^2) 降到了 O(log2 N) 了