我想在一个10M的文本文件中搜索指定字符串，请问该怎么做

用StrPos，不需要分配内存，就在原内存上找
看例子
var
  Buf : String;
  Key : String;
  FP  : PChar;
begin
  Buf := '';//改成你的读取过程
  Key := '查询字符串';
  //首先得判断一下查询和被查询字符串是否为空
  FP := StrPos(@Buf[1],@Key[1]);
  .......
end;

//楼主说错了吧,应该是KMP才对,这种算法最适应查询外部大容量存储信息,我写了个算法如下,可以实现文件内容高速查询,希望能为楼主带来帮助
unit Unit1;interfaceuses
  Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
  Dialogs, StdCtrls, StrUtils;type
  TForm1 = class(TForm)
    Button1: TButton;
    Memo1: TMemo;
    procedure Button1Click(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;var
  Form1: TForm1;implementation{$R *.dfm}
function KMP(tent,path:String;StartPos:Integer=1;EndPos:Integer=-1):Integer;
var
  f:array of Integer;
  t,i,j,p:Integer;
  lent:Integer;
  lenp:Integer;
begin
  if EndPos>0 then
    Lent:=EndPos
  else
    lent:=Length(tent);
  lenp:=Length(path);
  SetLength(f,lenp+1);
  f[1]:=0;
  for j:=2 to lenp do
  begin
    i:=f[j-1];
    while (path[j]<>path[i+1])and (i>0) do i:=f[i];
    if path[j]=path[i+1] then
      f[j]:=i+1
    else
      f[j]:=0;
  end;
  t:=StartPos;
  p:=1;
  while (t<=lent) and (p<=lenp) do
  begin
    if tent[t]=path[p] then
    begin
      Inc(t);
      Inc(p);
    end
    else if p=1 then
       Inc(t)
    else p:=f[p-1]+1;
  end;
  if p<lenp then
    Result:=-1
  else
    Result:=t-lenp;
end;function FindStrInFile(Str,FileName:String;var Positions:Array of Int64):Integer;
var
  FS:TFileStream;
  Buffer:String;
  i,ReadBegin,ReadSize,ReadLen:Integer;
  BeginPos:Int64;
  SLen:Integer;
begin
  Result:=0;
  SLen:=Length(Str);
  ReadBegin:=1;
  ReadSize:=SLen*100;
  BeginPos:=0;
  SetLength(Buffer,ReadSize*2);
  FS:=TFileStream.Create(FileName,fmOpenRead or fmShareDenyNone);  try
    while Result<High(Positions) do
    begin
      ReadLen:=FS.Read(Buffer[ReadBegin],ReadSize);
      if ReadLen>0 then
      begin
        i:=1;
        repeat
          i:=KMP(Buffer,Str,i,ReadBegin+ReadLen-1);
          if i>0 then
          begin
            Positions[Result]:=BeginPos+i;
            Inc(Result);
            Inc(i,SLen);
          end;
        until i<0;
        ReadSize:=SLen*99+1;
        if FS.Size-FS.Position<ReadSize then ReadSize:=FS.Size-FS.Position;
        ReadBegin:=100*SLen-ReadSize;
        Move(Buffer[ReadSize+1],Buffer[1],ReadBegin);
        BeginPos:=BeginPos+ReadSize;
      end
      else
        Break;
    end;
  finally
    FS.Free;
  end;
end;
procedure TForm1.Button1Click(Sender: TObject);
var
  ps:Array[0..100] of Int64;//搜索匹配窜的确位置序列
  i,n:Integer;
begin
  n:=FindStrInFile('xml',ExtractFilePath(Application.ExeName)+'QTDB.TXT',ps);
  for i:=0 to n-1 do
  begin
    Memo1.Lines.Add('第'+IntToStr(i)+'位置在:'+IntToStr(ps[i]));
  end;
end;end.

我的笔误,把KMP算法写成了"BMP"算法,KMP算法要源字符串作参数,很难想象源字符串有10M大小

好像BM算法比KMP算法快了很多，楼主去找找，我记得以前有过，不过不记得了。

BM算法我早试过了，但是不知如何读取10M的文件，难道要length(s)=10M吗

AnsiString的长度可以达到4G，如果你有那么多内存的话。4G>>10M

我感觉lz的头脑不大清楚。你题目说要在10M的文件里搜索一个str。显然，str是要在文件里匹配的模式串
然后你又在问题里说“如果用BMP算法，需要提供的参数有源字符串和目标字符串，但是源目标字符串有10M，恐怕不行”。为什么不行？KMP根本是不需要回溯的。而且，你就算用strpos，照样要把那10M从头到尾过一遍。那么，你说那话的原因很清楚，你把那个10M的文件当做模式串了。
下面的话证实了我的判断“KMP算法要源字符串作参数,很难想象源字符串有10M大小”所以，你完全可以用KMP，只是要先把概念弄明白。

晕倒，对于外部存储设备的大容量信息查询，使用KMP是最适宜的算法，它的前滚无重复匹配过程最适应外部信息的分段匹配，而且即使纯内部信息查询当信息量大时其时间复杂度也是最优的,在最坏的情况下是O(m)。再者几位所说的MP是指哪个算法？？

大家可以访问以下网站确认KMP算法
http://mhss.nease.net/string/KMP.html

我实际上是有一个大约200M的数据表要读，我想把内容导入到文本，分成10M一个的文本文件，再用KMP或BM算法搜索匹配的字符串，要在4秒内返回结果，反正我在数据库内用KMP和BM算法遍历搜索要用40秒以上，所以想把记录导入到文本文件试验一下，如果10M的速度可以的话，考虑分成50M一个的文件，大家有做过这类搜索吗

function FindPosWithSys(const ASource, ASub: string; StartPos: Integer): Integer;
begin
  // PosEx in D7.StrUtils
  Result := PosEx(ASub, ASource, StartPos);
end;
type
  TFindPos = function(const ASource, ASub: string; StartPos: Integer): Integer;procedure TForm1.CompareButtonClick(Sender: TObject);  function StringFromFile(const FileName: string): string;
  var
    FileSize: Integer;
  begin
    with TMemoryStream.Create do
    try
      LoadFromFile(FileName);
      FileSize := Size;
      SetLength(Result, FileSize);
      Move(Memory^, Result[1], FileSize);
    finally
      Free;
    end;
  end;

  procedure Search(const ASource, ASub, MethodName: string; SearchPos: TFindPos);
  var
    S1, E1: DWORD;
    S2, E2: Int64;
    SubLen, StartPos, SearchCount: Integer;
  begin
    S1 := GetTickCount;
    QueryPerformanceCounter(S2);
    StartPos := 1;
    SearchCount := 0;
    SubLen := Length(ASub);
    StartPos := SearchPos(ASource, ASub, StartPos);
    while StartPos <> 0 do
    begin
      Inc(SearchCount);
      Inc(StartPos, SubLen);
      StartPos := SearchPos(ASource, ASub, StartPos);
    end;
    QueryPerformanceCounter(E2);
    E1 := GetTickCount;
    MsgMemo.Lines.Add(Format('%20.20s GetTick Time: %10.10d, Query Counter: %10.10d, Search: %d',
      [MethodName, E1 - S1, E2 - S2, SearchCount]));
  end;var
  Source, Find: string;
begin
  Find := Trim(FindEdit.Text);
  if Find = '' then
    raise Exception.Create('嗯，找到N多，俺就不说了。');
  Source := StringFromFile(FileEdit.Text);
  try
    Search(Source, Find, 'FindPosWithPosEx', FindPosWithSys);
  finally
    SetLength(Source, 0);
  end;
end;

10M一般查找内容一般都在0.0N秒内好像（以前测的），即时你用的是PosEx, 费时的是read/write文件，所以，一般优化的操作是在读写文件，看你的用途了。谁将KMP算法写成:
TFindPos = function(const ASource, ASub: string; StartPos: Integer): Integer;
然后套到上面的
  try
    Search(Source, Find, 'FindPosWithKMP', FindPosWithKMP);
  finally
    SetLength(Source, 0);
  end;看看KMP跟PosEx快多少

假如考虑内存问题，没必要将200M文件分块，你可以使用我的那段程序作如下调整：
unit Unit1;interfaceuses
  Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
  Dialogs, StdCtrls, StrUtils;type
  TForm1 = class(TForm)
    Button1: TButton;
    Memo1: TMemo;
    procedure Button1Click(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;var
  Form1: TForm1;implementation{$R *.dfm}
function KMP(tent,path:String;StartPos:Integer=1;EndPos:Integer=-1):Integer;
var
  f:array of Integer;
  t,i,j,p:Integer;
  lent:Integer;
  lenp:Integer;
begin
  if EndPos>0 then
    Lent:=EndPos
  else
    lent:=Length(tent);
  lenp:=Length(path);
  SetLength(f,lenp+1);
  f[1]:=0;
  for j:=2 to lenp do
  begin
    i:=f[j-1];
    while (path[j]<>path[i+1])and (i>0) do i:=f[i];
    if path[j]=path[i+1] then
      f[j]:=i+1
    else
      f[j]:=0;
  end;
  t:=StartPos;
  p:=1;
  while (t<=lent) and (p<=lenp) do
  begin
    if tent[t]=path[p] then
    begin
      Inc(t);
      Inc(p);
    end
    else if p=1 then
       Inc(t)
    else p:=f[p-1]+1;
  end;
  if p<lenp then
    Result:=-1
  else
    Result:=t-lenp;
end;function FindStrInFile(Str,FileName:String;var Positions:Array of Int64):Integer;
const
  BUFFERBLOCK=10000;//调整这个可以加快文件读取进度
var
  FS:TFileStream;
  Buffer:String;
  i,ReadBegin,ReadSize,ReadLen:Integer;
  BeginPos:Int64;
  SLen:Integer;
begin
  Result:=0;
  SLen:=Length(Str);
  ReadBegin:=1;
  ReadSize:=SLen*BUFFERBLOCK;
  BeginPos:=0;
  SetLength(Buffer,ReadSize);
  FS:=TFileStream.Create(FileName,fmOpenRead or fmShareDenyNone);
  try
    while Result<High(Positions) do
    begin
      ReadLen:=FS.Read(Buffer[ReadBegin],ReadSize);
      if ReadLen>0 then
      begin
        i:=1;
        repeat
          i:=KMP(Buffer,Str,i,ReadBegin+ReadLen-1);
          if i>0 then
          begin
            Positions[Result]:=BeginPos+i;
            Inc(Result);
            Inc(i,SLen);
          end;
        until i<0;
        ReadSize:=(BUFFERBLOCK-1)*SLen+1;
        if FS.Size-FS.Position<ReadSize then ReadSize:=FS.Size-FS.Position;
        ReadBegin:=BUFFERBLOCK*SLen-ReadSize+1;
        Move(Buffer[ReadSize+1],Buffer[1],ReadBegin-1);
        BeginPos:=BeginPos+ReadSize;
      end
      else
        Break;
    end;
  finally
    FS.Free;
  end;
end;
procedure TForm1.Button1Click(Sender: TObject);
var
  ps:Array[0..100] of Int64;//搜索匹配窜的确位置序列
  i,n:Integer;
begin
  n:=FindStrInFile('xml',ExtractFilePath(Application.ExeName)+'QTDB.TXT',ps);
  for i:=0 to n-1 do
  begin
    Memo1.Lines.Add('第'+IntToStr(i)+'位置在:'+IntToStr(ps[i]));
  end;
end;end.

//楼上大哥看看PosEx的源代码，保证吓你一大跳
function PosEx(const SubStr, S: string; Offset: Cardinal = 1): Integer;
var
  I,X: Integer;
  Len, LenSubStr: Integer;
begin
  if Offset = 1 then
    Result := Pos(SubStr, S)
  else
  begin
    I := Offset;
    LenSubStr := Length(SubStr);
    Len := Length(S) - LenSubStr + 1;
    while I <= Len do
    begin
      if S[I] = SubStr[1] then
      begin
        X := 1;
        while (X < LenSubStr) and (S[I + X] = SubStr[X + 1]) do
          Inc(X);
        if (X = LenSubStr) then
        begin
          Result := I;
          exit;
        end;
      end;
      Inc(I);
    end;
    Result := 0;
  end;
end;

2 wizardqi
你将你的KMP写成: FindPos(const ASource, ASub: string; StartPos: Integer): Integer;格式，然后代入那上面的代码去比较一下。我只想比较一下，不管哪种写法。

而且PosEx简单，你没看到PosEx只有几个简单的变量，进行简单的比较，而没能其它函数中初始的过程，一般过程，专门的算法只是为了专门的用途，所以，俺平常用的还是它的多。

PosEx代码很正常啊，怎么吓我一跳了？
另：刚才改了你的KMP函数，还真是吓我一跳，丢到我的那函数中，好像死掉了，郁闷，还是你来吧。

那么，用BM好了。lz最后发的帖子终于比较清楚地说出了真实目的。不管你文件多大，关键是，你可以用MapFile来快速读文件，这样就不用考虑什么内存问题。这样，再配BM算法，应该比较快。我刚刚google了一下，KMP的平均查找时间不如BM。

function FindPosWithBM(const ASource, ASub: string; StartPos: Integer): Integer;
const
  MAX_CHAR = 256;
  SizeInt = SizeOf(Integer);
type
  PByteArr = ^TByteArr;
  TByteArr = array [0..MaxInt - 1] of Byte;
var
  Src, Sub: PByte;
  I, J, CurrPos, SubLen, SrcLen: Integer;
  Buffer: array [0..MAX_CHAR - 1] of Integer;
begin
  Result := 0;
  SubLen := Length(ASub);
  SrcLen := Length(ASource);
  if SubLen > SrcLen then Exit;  Sub := PByte(ASub);
  Src := PByte(ASource);
  for I := 0 to MAX_CHAR - 1 do
    Buffer[I] := SubLen;
  for I := 0 to SubLen - 2 do
    Buffer[PByteArr(Sub)^[I]] := SubLen - I - 1;  CurrPos := SubLen + StartPos - 2;
  while CurrPos < SrcLen do
  begin
    I := CurrPos;
    J := SubLen - 1;
    while (J >= 0) and ((PByteArr(Src)^[I] = PByteArr(Sub)^[J])) do
    begin
      Dec(J);
      Dec(I);
    end;
    if -1 = J then
    begin
      Result := CurrPos - SubLen + 1;
      break;
    end;
    Inc(CurrPos, Buffer[PByteArr(Src)^[CurrPos]]);
  end;
end;function KMP(const ASource, ASub: string; StartPos: Integer = 1):Integer;
var
  F: array of Integer;
  I, J, SrcLen, SubLen: Integer;
begin
  Result := 0;
  SubLen := Length(ASub);
  SetLength(f,SubLen + 1);
  SrcLen := Length(ASource);
  F[0] := 0;
  F[1] := 0;  I := 1; J := 0;
  FillChar(F[0], SizeOf(Integer) * (SubLen + 1), 0);
  while I < SubLen do
    if (J = 0) or (ASub[I] = ASub[J]) then
    begin
      Inc(I);
      Inc(J);
      F[I] := J;
    end else
      J := F[J];  J := 1;
  I := StartPos;
  while (I <= SrcLen) and (J <= SubLen) do
  begin
    if ASource[I] = ASub[J] then
    begin
      Inc(I);
      Inc(J);
    end
    else
    if J = 1 then
      Inc(I)
    else
      J := F[J - 1] + 1;
    if J > SubLen then
    begin
      Result := I - SubLen;
      break;
    end;
  end;
end;function FindPosWithKMP(const ASource, ASub: string; AStartPos: Integer): Integer;
begin
  Result := KMP(ASource, ASub, AStartPos);
end;

弱弱的问一下,MapFile怎么用,另外,BM和KMP各自在数据表中查找时BM比KMP快6秒左右,我想大概是因为我查找的是中文字符串的缘故吧,若是英文,大概就能比BMP快3到5倍,就像文献中说的一样,我之所以想用文本文件查找,主要是因为数据库查找不能令人满意,我的文件有45000多个,在sqlserver中存储,每一项都是一个文本文件的字符串,用BM和KMP查找,用时40多秒,我就想把这些项分别导入几个大的文本文件中,然后查找,在一个文本文件中找到后可以确定它对应数据库中的哪些项,然后再在数据库中查找相应的项,返回正确的文件名,我试验过,用sql语句查找7000项的内容是瞬间完成的,我不知道我的想法是否可行,或者大家有更好的想法

用下面这个函数，生成一个大概10M的文件，内容是楼主发贴的内容：
>>如果用BMP算法，需要提供的参数有源字符串和目标字符串，但是源目标字符串有10M，恐怕不
>>行，如果用strPos函数，则需要为这个文本文件buffer分配10M空间，然后再用一个pChar指针
>>指向它，但是给buffer分配10M的空间是否会出错呢，有人能指点一下吗用发贴内容生成的文件有10M，查找的内容是：buffer
测试的结果是：
FindPosWithBM GetTick Time: 0000000047, Query Counter: 0000194279, Search: 88318
FindPosWithSys GetTick Time: 0000000031, Query Counter: 0000125786, Search: 88318
FindPosWithKMP GetTick Time: 0000000125, Query Counter: 0000474927, Search: 88318FindPosWithBM GetTick Time: 0000000063, Query Counter: 0000217063, Search: 88318
FindPosWithSys GetTick Time: 0000000047, Query Counter: 0000190896, Search: 88318
FindPosWithKMP GetTick Time: 0000000110, Query Counter: 0000419372, Search: 88318FindPosWithBM GetTick Time: 0000000078, Query Counter: 0000234192, Search: 88318
FindPosWithSys GetTick Time: 0000000047, Query Counter: 0000150726, Search: 88318
FindPosWithKMP GetTick Time: 0000000140, Query Counter: 0000482356, Search: 88318procedure TForm1.AddFileButtonClick(Sender: TObject);  procedure AddFile(FileName: string);
  const
    TEST_SIZE = 10024 * 1000;
  var
    Buffer: Pointer;
    FileSize, Count: Integer;
  begin
    with TFileStream.Create(FileName, fmOpenReadWrite) do
    try
      FileSize := Size;
      Count := FileSize;
      GetMem(Buffer, Count);
      try
        ReadBuffer(Buffer^, Count);
        Inc(FileSize, Write(Buffer^, Count));
        while TEST_SIZE > FileSize do
          Inc(FileSize, Write(Buffer^, Count));
        Caption := Format('文件大小为：%d', [FileSize]);
      finally
        FreeMem(Buffer);
      end;
    finally
      Free;
    end;
  end;

begin
  AddFile(FileEdit.Text);
end;

//可能吧. 不过我使用下面的代码扫描100m的文件耗时247毫秒相信楼主对这个结果应该满意了吧！^_^unit Unit1;interfaceuses
  Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
  Dialogs, StdCtrls, StrUtils, Math, ZLib;type
  TForm1 = class(TForm)
    Button1: TButton;
    Memo1: TMemo;
    procedure Button1Click(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;var
  Form1: TForm1;implementation{$R *.dfm}
function KMP(tent,path:String;StartPos:Integer=1;EndPos:Integer=-1):Integer;
var
  f:array of Integer;
  t,i,j,p:Integer;
  lent:Integer;
  lenp:Integer;
begin
  if EndPos>0 then
    Lent:=EndPos
  else
    lent:=Length(tent);
  lenp:=Length(path);
  SetLength(f,lenp+1);
  f[1]:=0;
  for j:=2 to lenp do
  begin
    i:=f[j-1];
    while (path[j]<>path[i+1])and (i>0) do i:=f[i];
    if path[j]=path[i+1] then
      f[j]:=i+1
    else
      f[j]:=0;
  end;
  t:=StartPos;
  p:=1;
  while (t<=lent) and (p<=lenp) do
  begin
    if tent[t]=path[p] then
    begin
      Inc(t);
      Inc(p);
    end
    else if p=1 then
       Inc(t)
    else p:=f[p-1]+1;
  end;
  if p<lenp then
    Result:=-1
  else
    Result:=t-lenp;
end;
procedure FindStrInFile(Str,FileName:String;out Positions:Pointer;out GetPosNum:Integer);
const
  BUFFERBLOCK=10000;//调整这个可以加快文件读取进度
var
  FS:TFileStream;
  Buffer:String;
  i,ReadBegin,ReadSize,ReadLen:Integer;
  BeginPos,Count:Int64;
  SLen:Integer;
begin
  Count:=0;
  GetPosNum:=BUFFERBLOCK;
  GetMem(Positions,SizeOf(Int64)*GetPosNum);
  SLen:=Length(Str);
  ReadBegin:=1;
  ReadSize:=SLen*BUFFERBLOCK;
  BeginPos:=0;
  SetLength(Buffer,ReadSize);
  FS:=TFileStream.Create(FileName,fmOpenRead or fmShareDenyNone);
  try
    try
      while True do
      begin
        ReadLen:=FS.Read(Buffer[ReadBegin],ReadSize);
        if ReadLen>0 then
        begin
          i:=1;
          repeat
            i:=KMP(Buffer,Str,i,ReadBegin+ReadLen-1);
            if i>0 then
            begin
              if Count>GetPosNum then
              begin
                Inc(GetPosNum,BUFFERBLOCK*SizeOf(Int64));
                ReallocMem(Positions,GetPosNum);
              end;
              PInt64(Integer(Positions)+Count*SizeOf(Int64))^:=BeginPos+i;
              Inc(Count);
              Inc(i,SLen);
            end;
          until i<0;
          if ReadLen<ReadSize then Break;
          if Count>0 then
            ReadSize:=Max((BUFFERBLOCK-1)*SLen+1,i+SLen)
          else
            ReadSize:=(BUFFERBLOCK-1)*SLen+1;
          if FS.Size-FS.Position<ReadSize then ReadSize:=FS.Size-FS.Position;
          ReadBegin:=BUFFERBLOCK*SLen-ReadSize+1;
          Move(Buffer[ReadSize+1],Buffer[1],ReadBegin-1);
          BeginPos:=BeginPos+ReadSize;
        end
        else
          Break;
      end;
      GetPosNum:=Count;
      ReallocMem(Positions,GetPosNum*SizeOf(Int64));
    except
      on E:Exception do
      begin
        GetPosNum:=0;
        FreeMem(Positions);
      end;
    end;
  finally
    FS.Free;
  end;
end;
procedure TForm1.Button1Click(Sender: TObject);
var
  ps:Pointer;//搜索匹配窜的确位置序列
  s,e,i,n:Integer;
begin
  s:=GetTickCount;
  FindStrInFile('xml is a good document!',ExtractFilePath(Application.ExeName)+'QTDB2.TXT',ps,n);
  e:=GetTickCount;
  Memo1.Lines.Add('本次扫描耗时'+IntToStr(e-s)+'毫秒');
  Memo1.Lines.Add('扫描到'+IntToStr(n)+'个结果');
  for i:=0 to n-1 do
  begin
    Memo1.Lines.Add('第'+IntToStr(i+1)+'位置在:'+IntToStr(PInt64(Integer(ps)+i*SizeOf(Int64))^));
  end;
  FreeMem(ps);
end;end.

据说全文检索的东西，索引比内容大的多，所以索引要进行再索引。
一般是用HASH做，我也想研究一下，不过，感觉有点不自量力。

请问wizardqi(男巫) ，如果查找的内容是你的100M文件的最后一行的内容，要多少时间，能测一下吗

呵呵,231毫秒,也许是我的电脑配置好了点(DELL品牌机，P4 2.8+1G DDR(硬盘好象是7200转的，其实在这项应用中内存的频率和硬盘的读取效率影响极大).

楼上贴程序的，统统打pp，然后拖出去面壁！请注意了，程序都有问题。你们没有考虑到边界问题。假如buf为1k，模式串是abcde
如果文件的1023处有abcde，那么按照你们的方法是找不到的！
lz可以在MSDN里查MapViewOfFile

2 firstrose
我好像没使用buf吧，就是使用了buf，那是调用者的问题，而不是算法本身的问题。Windows N多API，我没调用好，那是自己的问题，关MS什么事啊。

这么好的贴子为什么没有人加FAQ？

调试易

我想在一个10M的文本文件中搜索指定字符串，请问该怎么做

解决方案 »