var myl,mym:TStringList; mys:string; myi:integer; begin myl:=TStringList.create; mym:=TStringList.create; myl.LoadFromFile('input.txt'); myl.Sort; mys:=''; for myi:=0 to myl.Count-1 do if mys<>myl.Strings[myi] then begin mym.Add(mys); mys:=myl.Strings[myi]; end; mym.SaveToFile('out.txt'); myl.Free; mym.Free; end;这个方法效率如何?
加上一行Sort,可以提高不少速度。 关于TStringList类的Sort和IndexOf函数,Delphi中提供了源代码的。其中,Sort使用的是快速排序,IndexOf在有Sort的情况下为二分法查找,否则为顺序查找。 效率应该不低的。procedure TForm1.Button1Click(Sender: TObject); var AInput : TStringList; AOutput : TStringList; iLoop : Integer; sTemp : String; begin AInput := TStringList.Create; try AInput.LoadFromFile('C:\Input.TXT'); AInput.Sort; AOutput := TStringList.Create; try for iLoop := 0 to AInput.Count - 1 do begin sTemp := AInput.Strings[iLoop]; if AOutput.IndexOf(sTemp) < 0 then AOutput.Add(sTemp); end; AOutput.SaveToFile('C:\Outpt.TXT'); finally AOutput.Free; end; finally AInput.Free; end; end;
unit uCheckDup;interfaceuses Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Dialogs, StdCtrls;procedure StartCheckDup; function CheckDup(AStr: string): boolean;implementationvar StrListArray: array of TStringList;const BufSize = 65536;// 64Kprocedure StartCheckDup; var I: integer; begin SetLength(StrListArray, BufSize); for I := 0 to BufSize - 1 do StrListArray[I].Clear; end;function CheckDup(AStr: string): boolean; type TWordArray = array of word; var Key: word; I, L: integer; AStrList: TStringList; begin Key := 0; L := length(AStr); if L = 1 then Key := Ord(AStr[1]) else for I := (L shr 1) - 1 downto 0 do Key := Key + TWordArray(PChar(AStr))[I]; if (L and 1) <> 0 then Key := Key + Ord(AStr[L]); AStrList := StrListArray[Key]; if (AStrList.Count = 0) or (AStrList.IndexOf(AStr) < 0) then begin AStrList.Append(AStr); Result := False; end else Result := True; end;procedure GenerateArray; var I: integer; begin SetLength(StrListArray, BufSize); for I := 0 to BufSize - 1 do StrListArray[I] := TStringList.Create; end;procedure FreeArray; var I: integer; begin for I := 0 to BufSize - 1 do FreeAndNil(StrListArray[I]); end;initialization GenerateArray; finalization FreeArray; end.
使用方法:procedure TForm1.Button1Click(Sender: TObject); var ATick: DWord; I: integer; begin ATick := GetTickCount; StartCheckDup; sl2.Clear; for I := 0 to sl.Count-1 do begin if not CheckDup(sl[I]) then sl2.Append(sl[I]); Caption := IntToStr(I); end; ShowMessage('Time:' + IntToStr(GetTickCount - ATick) + 'ms,Remains:' + IntToStr(sl2.Count)); end;
//十万行,双核1.6G耗时1.2 秒var I: Integer; vTickCount: Longword; begin Randomize; // test with TStringList.Create do try //LoadFromFile('input.txt'); //载入文件 for I := 1 to 100000 do Add(IntToStr(Random(MaxInt))); // 产生十万行文本 vTickCount := GetTickCount; Sort; //排序 for I := Count - 1 downto 0 do if (I >= 1) and (Strings[I] = Strings[I - 1]) then Delete(I); Caption := IntToStr(GetTickCount - vTickCount); // 输出用时 finally Free; end; end;
应该这样,规定字符串的长度为某个固定值比较好。我推荐20。测试数据: var I: integer; begin Randomize; sl.BeginUpdate; sl.Clear; for I := 0 to 100000 do begin sl.Append('test' + format('%.11d', [random(50000)])); Caption := IntToStr(I); end; sl.Sort; sl.EndUpdate;
(不知道重复率有多大?)利用Hash值来进行快速排出,算法复杂度接近O(N):
1.建立一个TNode数组(大小与不重复的数据量相当,或大些也可以) TNode=record Str:Pchar PNext : ^TNode end; (单向链表)
2.计算每个数据S的Hash值,映射到数组元素I, 如果I.PChar=nil,则I.PChar=s; 否则对比这个单向链表的所有字符串值,看S是否已经有了,没有的话添加到I的最后面;
var
AInput : TStringList;
AOutput : TStringList;
iLoop : Integer;
sTemp : String;
begin
AInput := TStringList.Create;
try
AInput.LoadFromFile('C:\Input.TXT');
AOutput := TStringList.Create;
try
for iLoop := 0 to AInput.Count - 1 do
begin
sTemp := AInput.Strings[iLoop];
if AOutput.IndexOf(sTemp) < 0 then AOutput.Add(sTemp);
end;
AOutput.SaveToFile('C:\Outpt.TXT');
finally
AOutput.Free;
end;
finally
AInput.Free;
end;
end;
myl,mym:TStringList;
mys:string;
myi:integer;
begin
myl:=TStringList.create;
mym:=TStringList.create;
myl.LoadFromFile('input.txt');
myl.Sort;
mys:='';
for myi:=0 to myl.Count-1 do
if mys<>myl.Strings[myi] then
begin
mym.Add(mys);
mys:=myl.Strings[myi];
end;
mym.SaveToFile('out.txt');
myl.Free;
mym.Free;
end;这个方法效率如何?
用存盘的方法(建个临时目录),把一行数据当成一个文件名,放硬盘上存,如果重名了就不存,这样只从头到尾走一次就够了,执行完后,在用dir命令存成一个文件就成了!
关于TStringList类的Sort和IndexOf函数,Delphi中提供了源代码的。其中,Sort使用的是快速排序,IndexOf在有Sort的情况下为二分法查找,否则为顺序查找。
效率应该不低的。procedure TForm1.Button1Click(Sender: TObject);
var
AInput : TStringList;
AOutput : TStringList;
iLoop : Integer;
sTemp : String;
begin
AInput := TStringList.Create;
try
AInput.LoadFromFile('C:\Input.TXT');
AInput.Sort;
AOutput := TStringList.Create;
try
for iLoop := 0 to AInput.Count - 1 do
begin
sTemp := AInput.Strings[iLoop];
if AOutput.IndexOf(sTemp) < 0 then AOutput.Add(sTemp);
end;
AOutput.SaveToFile('C:\Outpt.TXT');
finally
AOutput.Free;
end;
finally
AInput.Free;
end;
end;
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls,
Dialogs, StdCtrls;procedure StartCheckDup;
function CheckDup(AStr: string): boolean;implementationvar
StrListArray: array of TStringList;const
BufSize = 65536;// 64Kprocedure StartCheckDup;
var
I: integer;
begin
SetLength(StrListArray, BufSize);
for I := 0 to BufSize - 1 do
StrListArray[I].Clear;
end;function CheckDup(AStr: string): boolean;
type
TWordArray = array of word;
var
Key: word;
I, L: integer;
AStrList: TStringList;
begin
Key := 0;
L := length(AStr);
if L = 1 then
Key := Ord(AStr[1])
else
for I := (L shr 1) - 1 downto 0 do
Key := Key + TWordArray(PChar(AStr))[I]; if (L and 1) <> 0 then
Key := Key + Ord(AStr[L]); AStrList := StrListArray[Key];
if (AStrList.Count = 0) or (AStrList.IndexOf(AStr) < 0) then
begin
AStrList.Append(AStr);
Result := False;
end
else
Result := True;
end;procedure GenerateArray;
var
I: integer;
begin
SetLength(StrListArray, BufSize);
for I := 0 to BufSize - 1 do
StrListArray[I] := TStringList.Create;
end;procedure FreeArray;
var
I: integer;
begin
for I := 0 to BufSize - 1 do
FreeAndNil(StrListArray[I]);
end;initialization
GenerateArray;
finalization
FreeArray;
end.
var
ATick: DWord;
I: integer;
begin
ATick := GetTickCount;
StartCheckDup;
sl2.Clear;
for I := 0 to sl.Count-1 do
begin
if not CheckDup(sl[I]) then
sl2.Append(sl[I]);
Caption := IntToStr(I);
end;
ShowMessage('Time:' + IntToStr(GetTickCount - ATick)
+ 'ms,Remains:' + IntToStr(sl2.Count));
end;
I: Integer;
vTickCount: Longword;
begin
Randomize; // test
with TStringList.Create do try
//LoadFromFile('input.txt'); //载入文件
for I := 1 to 100000 do Add(IntToStr(Random(MaxInt))); // 产生十万行文本 vTickCount := GetTickCount;
Sort; //排序 for I := Count - 1 downto 0 do
if (I >= 1) and (Strings[I] = Strings[I - 1]) then
Delete(I);
Caption := IntToStr(GetTickCount - vTickCount); // 输出用时
finally
Free;
end;
end;
var
I: integer;
begin
Randomize;
sl.BeginUpdate;
sl.Clear;
for I := 0 to 100000 do
begin
sl.Append('test' + format('%.11d', [random(50000)]));
Caption := IntToStr(I);
end;
sl.Sort;
sl.EndUpdate;
搂主感谢我吧。注意:本次算法函数返回值调整了一下,无重复返回true,和以前相反。
unit uCheckDup;interfaceuses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls,
Dialogs, StdCtrls;procedure StartCheckDup;
function CheckDup(AStr: string): boolean;implementationconst
BufSize = 65536;// 64Kvar
StrListArray: array of TStringList;
Crc16Tab: array[0..$FF] of word =
($00000, $01021, $02042, $03063, $04084, $050a5, $060c6, $070e7,
$08108, $09129, $0a14a, $0b16b, $0c18c, $0d1ad, $0e1ce, $0f1ef,
$01231, $00210, $03273, $02252, $052b5, $04294, $072f7, $062d6,
$09339, $08318, $0b37b, $0a35a, $0d3bd, $0c39c, $0f3ff, $0e3de,
$02462, $03443, $00420, $01401, $064e6, $074c7, $044a4, $05485,
$0a56a, $0b54b, $08528, $09509, $0e5ee, $0f5cf, $0c5ac, $0d58d,
$03653, $02672, $01611, $00630, $076d7, $066f6, $05695, $046b4,
$0b75b, $0a77a, $09719, $08738, $0f7df, $0e7fe, $0d79d, $0c7bc,
$048c4, $058e5, $06886, $078a7, $00840, $01861, $02802, $03823,
$0c9cc, $0d9ed, $0e98e, $0f9af, $08948, $09969, $0a90a, $0b92b,
$05af5, $04ad4, $07ab7, $06a96, $01a71, $00a50, $03a33, $02a12,
$0dbfd, $0cbdc, $0fbbf, $0eb9e, $09b79, $08b58, $0bb3b, $0ab1a,
$06ca6, $07c87, $04ce4, $05cc5, $02c22, $03c03, $00c60, $01c41,
$0edae, $0fd8f, $0cdec, $0ddcd, $0ad2a, $0bd0b, $08d68, $09d49,
$07e97, $06eb6, $05ed5, $04ef4, $03e13, $02e32, $01e51, $00e70,
$0ff9f, $0efbe, $0dfdd, $0cffc, $0bf1b, $0af3a, $09f59, $08f78,
$09188, $081a9, $0b1ca, $0a1eb, $0d10c, $0c12d, $0f14e, $0e16f,
$01080, $000a1, $030c2, $020e3, $05004, $04025, $07046, $06067,
$083b9, $09398, $0a3fb, $0b3da, $0c33d, $0d31c, $0e37f, $0f35e,
$002b1, $01290, $022f3, $032d2, $04235, $05214, $06277, $07256,
$0b5ea, $0a5cb, $095a8, $08589, $0f56e, $0e54f, $0d52c, $0c50d,
$034e2, $024c3, $014a0, $00481, $07466, $06447, $05424, $04405,
$0a7db, $0b7fa, $08799, $097b8, $0e75f, $0f77e, $0c71d, $0d73c,
$026d3, $036f2, $00691, $016b0, $06657, $07676, $04615, $05634,
$0d94c, $0c96d, $0f90e, $0e92f, $099c8, $089e9, $0b98a, $0a9ab,
$05844, $04865, $07806, $06827, $018c0, $008e1, $03882, $028a3,
$0cb7d, $0db5c, $0eb3f, $0fb1e, $08bf9, $09bd8, $0abbb, $0bb9a,
$04a75, $05a54, $06a37, $07a16, $00af1, $01ad0, $02ab3, $03a92,
$0fd2e, $0ed0f, $0dd6c, $0cd4d, $0bdaa, $0ad8b, $09de8, $08dc9,
$07c26, $06c07, $05c64, $04c45, $03ca2, $02c83, $01ce0, $00cc1,
$0ef1f, $0ff3e, $0cf5d, $0df7c, $0af9b, $0bfba, $08fd9, $09ff8,
$06e17, $07e36, $04e55, $05e74, $02e93, $03eb2, $00ed1, $01ef0);function CRCValue(AStr: string): Word;
var
i: integer;
begin
Result := 0;
for i := Length(AStr) downto 1 do
Result := Hi(Result) xor CRC16Tab[byte(AStr[i]) xor Lo(Result)];
end;procedure StartCheckDup;
var
I: integer;
begin
SetLength(StrListArray, BufSize);
for I := 0 to BufSize - 1 do
StrListArray[I].Clear;
end;function CheckDup(AStr: string): boolean;
begin
with StrListArray[CRCValue(AStr)] do
begin
Result := (Count = 0) or (IndexOf(AStr) < 0);
if Result then
Append(AStr);
end;
end;procedure GenerateArray;
var
I: integer;
begin
SetLength(StrListArray, BufSize);
for I := 0 to BufSize - 1 do
StrListArray[I] := TStringList.Create;
end;procedure FreeArray;
var
I: integer;
begin
for I := 0 to BufSize - 1 do
FreeAndNil(StrListArray[I]);
end;initialization
GenerateArray;
finalization
FreeArray;
end.