处理特大文本文件是一个老话题,在VB版本经常可以见到相关问题。
以文本文件方式一行一行读取速度慢、一次读入内存又占用内存空间。
TID是给大型文本文件建立索引信息的工具。根据索引信息,可以迅速直接获取文件中任意一行或多行的文本。
之前所编写的TID 1.0(参见我博客)以及现在正在写的TID 2.0虽然功能更强,但是在太臃肿,令人看起来眼花缭乱。
针对最常用的功能,现编写一个最简单的形式。它可以解决如下问题:
1、建立索引信息。
1、根据索引信息返回指定行号范围的文本文件内容。
2、根据索引信息获取文本文件的行数。
Option Explicit
'TID简易版Private priLocate() As LongPrivate Sub Command1_Click()
'给YJ.txt建立索引。
'YJ.txt就是大唐三藏法师所取真经《瑜伽师地论》。容量1.8MB。
'此文本行数多(固定断行)、繁体字多,是我研究文本操作的标准测试文本。
priLocate() = MakeLoacte("YJ.txt", Text3)
Text1.Text = UBound(priLocate())
HScroll1.Min = 1
HScroll1.Max = UBound(priLocate())
End SubPrivate Sub Command2_Click()
'保存索引信息到Temp.ind。
LoacteSave "Temp.ind", priLocate()
End SubPrivate Sub Command3_Click()
'从Temp.ind读取索引信息
priLocate() = LoacteLoad("Temp.ind")
Text1.Text = UBound(priLocate())
HScroll1.Min = 1
HScroll1.Max = UBound(priLocate())
End SubPrivate Sub Command4_Click()
'读《瑜伽师地论》前100行。
Dim tFN As Integer
Dim tString As String
tFN = FreeFile
Open "YJ.txt" For Binary As tFN
tString = MultiLineGet(tFN, priLocate(), 1, 100)
If Not tString = "" Then
Text2.Text = StrConv(tString, vbUnicode)
Else
Text2.Text = tString
End If
Close tFN
End SubPrivate Sub Command5_Click()
'用一个2GB虚拟机文件测试。
'测试机型为P4E HT 3.0GHz(本人家里最破的电脑)。编译后为2GB文件建立索引大约需要60-70秒。处理能力大约30MB/s。
'欢迎编译后以不同机型测试。
'(本程序处理文件极限容量为2GB,因为Long类型就这么大了。对超过2GB的文本文件无能为力。)
priLocate() = MakeLoacte("Windows 98-s001.vmdk", Text3)
Text1.Text = UBound(priLocate())
End SubPrivate Sub HScroll1_Change()
'根据滚动条动态取随意行。
Dim tFN As Integer
Dim tString As String
tFN = FreeFile
Open "YJ.txt" For Binary As tFN
tString = LineGet(tFN, priLocate(), HScroll1.Value)
If Not tString = "" Then
Text1.Text = StrConv(tString, vbUnicode)
Else
Text1.Text = tString
End If
Close tFN
End Sub'以上是测试代码,以下是函数Public Function MultiLineGet(ByVal pFileNumber As Integer, ByRef pLocate() As Long, ByVal pStart As Long, ByVal pEnd As Long) As Byte()
'获取多行
Dim tBytes() As Byte
Dim tBytesLength As Long
Dim tTextStart As Long
Dim tTextSize As Long
Dim tLineStart As Long
Dim tLineSize As Long
Dim tLineCount As Long
Dim tIndexOnBound As Long tLineCount = UBound(pLocate())
tIndexOnBound = pStart > 0 And pStart <= tLineCount And pEnd > 0 And pEnd <= tLineCount And pEnd >= pStart If tIndexOnBound Then
tLineStart = pLocate(pStart - 1) + 2
tLineSize = pLocate(pEnd) - tLineStart
If tLineSize > 0 Then
tBytesLength = tLineSize - 1
ReDim tBytes(tBytesLength)
Get pFileNumber, tLineStart, tBytes()
End If
End If MultiLineGet = tBytes()
End FunctionPublic Function LineGet(ByVal pFileNumber As Integer, ByRef pLocate() As Long, ByVal pIndex As Long) As Byte()
'获取行
Dim tBytes() As Byte
Dim tBytesLength As Long
Dim tLineStart As Long
Dim tLineSize As Long
Dim tLineCount As Long
Dim tIndexOnBound As Long
tLineCount = UBound(pLocate())
tIndexOnBound = pIndex > 0 And pIndex <= tLineCount
If tIndexOnBound Then
tLineStart = pLocate(pIndex - 1) + 2
tLineSize = pLocate(pIndex) - tLineStart
If tLineSize > 0 Then
tBytesLength = tLineSize - 1
ReDim tBytes(tBytesLength)
Get pFileNumber, tLineStart, tBytes()
End If
End If
LineGet = tBytes()
End FunctionPublic Function LoacteLoad(ByVal pFile As String) As Long()
'LoacteLoad函数
'格式:[tLoacte() =]LoacteLoad(pFile)
'功能:从定位数据文件获取已保存的分隔符定位表。
'参数: pFile string 定位数据文件名
'返回: tLoacte() long 分隔符绝对地址定位表
Dim tLocate() As Long
Dim tFileNumber As Integer
tFileNumber = FreeFile
Open pFile For Binary As tFileNumber
ReDim tLocate((LOF(tFileNumber) - 1) \ 4)
Get tFileNumber, 1, tLocate()
Close tFileNumber
LoacteLoad = tLocate()
End FunctionPublic Sub LoacteSave(ByVal pFile As String, ByRef pLoacte() As Long)
'LoacteSave过程
'格式:LoacteSave pFile, pLoacte()
'功能:保存分隔符定位表到定位数据文件。
'参数: pFile string 定位数据文件名
'返回: pLoacte() long 定位表
'说明:使用此函数前一定要对目标文件进行检查,否则它会没有任何预告地清0文件写入信息。
Dim tFileNumber As Integer
'清0文件
tFileNumber = FreeFile
Open pFile For Output As tFileNumber
Close tFileNumber
'写入文件
tFileNumber = FreeFile
Open pFile For Binary As tFileNumber
Put tFileNumber, 1, pLoacte()
Close tFileNumber
End SubPublic Function MakeLoacte(ByVal pFile As String, Optional pTextBox As TextBox = Nothing) As Long()
'MakeLoacte函数
'格式:[tLoacte() =]MakeLoacte(pFile)
'功能:为文本文件建立分隔符定位表。
'参数: pFile string 文本文件名
'返回: tLoacte() long 分隔符绝对地址定位表
'说明:定位表的第0个元素肯定为-1,定位表的最后一个元素必定是文件长度+1。
Dim tLoacte() As Long
Dim tLoacteLength As Long
Dim tLoacteIndex As Long
Dim tFileNumber As Integer
Dim tBuffer() As Byte
Dim tBufferIndex As Long
Dim tBufferLength As Long
Dim tPageSize As Long
Dim tPageLength As Long
Dim tPageIndex As Long
Dim tFileSize As Long
Dim tFileLength As Long
Dim tLoadStart As Long
tPageSize = &H10000
tFileNumber = FreeFile
tLoacteLength = &HFFFF&
ReDim tLoacte(tLoacteLength)
Open pFile For Binary As tFileNumber
'计算64KB分页数量
tFileSize = LOF(tFileNumber): tFileLength = tFileSize - 1
tPageLength = (tFileLength) \ tPageSize
tLoacteIndex = 0: tLoacte(0) = -1
'历遍64KB分页
For tPageIndex = 0 To tPageLength
If tPageIndex = tPageLength Then
tBufferLength = (tFileLength) Mod tPageSize
Else
tBufferLength = tPageSize - 1
End If
ReDim tBuffer(tBufferLength)
tLoadStart = tPageIndex * tPageSize + 1 '当前页面基址(从VB实际读取地址1开始计算)
Get tFileNumber, tLoadStart, tBuffer()
'扫描页内字节分隔符
For tBufferIndex = 0 To tBufferLength
If tBuffer(tBufferIndex) = &HD Then
tLoacteIndex = tLoacteIndex + 1
If tLoacteIndex > tLoacteLength Then
tLoacteLength = tLoacteLength + &H10000
ReDim Preserve tLoacte(tLoacteLength)
End If
tLoacte(tLoacteIndex) = tLoadStart + tBufferIndex '绝对地址=页面基址+页面相对地址
End If
Next
If Not (pTextBox Is Nothing) Then
pTextBox.Text = Int(tPageIndex * 100 / tPageLength) & "%"
DoEvents
End If
Next
tLoacteIndex = tLoacteIndex + 1
tLoacte(tLoacteIndex) = LOF(tFileNumber) + 1
ReDim Preserve tLoacte(tLoacteIndex)
Close tFileNumber
MakeLoacte = tLoacte()
End Function
以文本文件方式一行一行读取速度慢、一次读入内存又占用内存空间。
TID是给大型文本文件建立索引信息的工具。根据索引信息,可以迅速直接获取文件中任意一行或多行的文本。
之前所编写的TID 1.0(参见我博客)以及现在正在写的TID 2.0虽然功能更强,但是在太臃肿,令人看起来眼花缭乱。
针对最常用的功能,现编写一个最简单的形式。它可以解决如下问题:
1、建立索引信息。
1、根据索引信息返回指定行号范围的文本文件内容。
2、根据索引信息获取文本文件的行数。
Option Explicit
'TID简易版Private priLocate() As LongPrivate Sub Command1_Click()
'给YJ.txt建立索引。
'YJ.txt就是大唐三藏法师所取真经《瑜伽师地论》。容量1.8MB。
'此文本行数多(固定断行)、繁体字多,是我研究文本操作的标准测试文本。
priLocate() = MakeLoacte("YJ.txt", Text3)
Text1.Text = UBound(priLocate())
HScroll1.Min = 1
HScroll1.Max = UBound(priLocate())
End SubPrivate Sub Command2_Click()
'保存索引信息到Temp.ind。
LoacteSave "Temp.ind", priLocate()
End SubPrivate Sub Command3_Click()
'从Temp.ind读取索引信息
priLocate() = LoacteLoad("Temp.ind")
Text1.Text = UBound(priLocate())
HScroll1.Min = 1
HScroll1.Max = UBound(priLocate())
End SubPrivate Sub Command4_Click()
'读《瑜伽师地论》前100行。
Dim tFN As Integer
Dim tString As String
tFN = FreeFile
Open "YJ.txt" For Binary As tFN
tString = MultiLineGet(tFN, priLocate(), 1, 100)
If Not tString = "" Then
Text2.Text = StrConv(tString, vbUnicode)
Else
Text2.Text = tString
End If
Close tFN
End SubPrivate Sub Command5_Click()
'用一个2GB虚拟机文件测试。
'测试机型为P4E HT 3.0GHz(本人家里最破的电脑)。编译后为2GB文件建立索引大约需要60-70秒。处理能力大约30MB/s。
'欢迎编译后以不同机型测试。
'(本程序处理文件极限容量为2GB,因为Long类型就这么大了。对超过2GB的文本文件无能为力。)
priLocate() = MakeLoacte("Windows 98-s001.vmdk", Text3)
Text1.Text = UBound(priLocate())
End SubPrivate Sub HScroll1_Change()
'根据滚动条动态取随意行。
Dim tFN As Integer
Dim tString As String
tFN = FreeFile
Open "YJ.txt" For Binary As tFN
tString = LineGet(tFN, priLocate(), HScroll1.Value)
If Not tString = "" Then
Text1.Text = StrConv(tString, vbUnicode)
Else
Text1.Text = tString
End If
Close tFN
End Sub'以上是测试代码,以下是函数Public Function MultiLineGet(ByVal pFileNumber As Integer, ByRef pLocate() As Long, ByVal pStart As Long, ByVal pEnd As Long) As Byte()
'获取多行
Dim tBytes() As Byte
Dim tBytesLength As Long
Dim tTextStart As Long
Dim tTextSize As Long
Dim tLineStart As Long
Dim tLineSize As Long
Dim tLineCount As Long
Dim tIndexOnBound As Long tLineCount = UBound(pLocate())
tIndexOnBound = pStart > 0 And pStart <= tLineCount And pEnd > 0 And pEnd <= tLineCount And pEnd >= pStart If tIndexOnBound Then
tLineStart = pLocate(pStart - 1) + 2
tLineSize = pLocate(pEnd) - tLineStart
If tLineSize > 0 Then
tBytesLength = tLineSize - 1
ReDim tBytes(tBytesLength)
Get pFileNumber, tLineStart, tBytes()
End If
End If MultiLineGet = tBytes()
End FunctionPublic Function LineGet(ByVal pFileNumber As Integer, ByRef pLocate() As Long, ByVal pIndex As Long) As Byte()
'获取行
Dim tBytes() As Byte
Dim tBytesLength As Long
Dim tLineStart As Long
Dim tLineSize As Long
Dim tLineCount As Long
Dim tIndexOnBound As Long
tLineCount = UBound(pLocate())
tIndexOnBound = pIndex > 0 And pIndex <= tLineCount
If tIndexOnBound Then
tLineStart = pLocate(pIndex - 1) + 2
tLineSize = pLocate(pIndex) - tLineStart
If tLineSize > 0 Then
tBytesLength = tLineSize - 1
ReDim tBytes(tBytesLength)
Get pFileNumber, tLineStart, tBytes()
End If
End If
LineGet = tBytes()
End FunctionPublic Function LoacteLoad(ByVal pFile As String) As Long()
'LoacteLoad函数
'格式:[tLoacte() =]LoacteLoad(pFile)
'功能:从定位数据文件获取已保存的分隔符定位表。
'参数: pFile string 定位数据文件名
'返回: tLoacte() long 分隔符绝对地址定位表
Dim tLocate() As Long
Dim tFileNumber As Integer
tFileNumber = FreeFile
Open pFile For Binary As tFileNumber
ReDim tLocate((LOF(tFileNumber) - 1) \ 4)
Get tFileNumber, 1, tLocate()
Close tFileNumber
LoacteLoad = tLocate()
End FunctionPublic Sub LoacteSave(ByVal pFile As String, ByRef pLoacte() As Long)
'LoacteSave过程
'格式:LoacteSave pFile, pLoacte()
'功能:保存分隔符定位表到定位数据文件。
'参数: pFile string 定位数据文件名
'返回: pLoacte() long 定位表
'说明:使用此函数前一定要对目标文件进行检查,否则它会没有任何预告地清0文件写入信息。
Dim tFileNumber As Integer
'清0文件
tFileNumber = FreeFile
Open pFile For Output As tFileNumber
Close tFileNumber
'写入文件
tFileNumber = FreeFile
Open pFile For Binary As tFileNumber
Put tFileNumber, 1, pLoacte()
Close tFileNumber
End SubPublic Function MakeLoacte(ByVal pFile As String, Optional pTextBox As TextBox = Nothing) As Long()
'MakeLoacte函数
'格式:[tLoacte() =]MakeLoacte(pFile)
'功能:为文本文件建立分隔符定位表。
'参数: pFile string 文本文件名
'返回: tLoacte() long 分隔符绝对地址定位表
'说明:定位表的第0个元素肯定为-1,定位表的最后一个元素必定是文件长度+1。
Dim tLoacte() As Long
Dim tLoacteLength As Long
Dim tLoacteIndex As Long
Dim tFileNumber As Integer
Dim tBuffer() As Byte
Dim tBufferIndex As Long
Dim tBufferLength As Long
Dim tPageSize As Long
Dim tPageLength As Long
Dim tPageIndex As Long
Dim tFileSize As Long
Dim tFileLength As Long
Dim tLoadStart As Long
tPageSize = &H10000
tFileNumber = FreeFile
tLoacteLength = &HFFFF&
ReDim tLoacte(tLoacteLength)
Open pFile For Binary As tFileNumber
'计算64KB分页数量
tFileSize = LOF(tFileNumber): tFileLength = tFileSize - 1
tPageLength = (tFileLength) \ tPageSize
tLoacteIndex = 0: tLoacte(0) = -1
'历遍64KB分页
For tPageIndex = 0 To tPageLength
If tPageIndex = tPageLength Then
tBufferLength = (tFileLength) Mod tPageSize
Else
tBufferLength = tPageSize - 1
End If
ReDim tBuffer(tBufferLength)
tLoadStart = tPageIndex * tPageSize + 1 '当前页面基址(从VB实际读取地址1开始计算)
Get tFileNumber, tLoadStart, tBuffer()
'扫描页内字节分隔符
For tBufferIndex = 0 To tBufferLength
If tBuffer(tBufferIndex) = &HD Then
tLoacteIndex = tLoacteIndex + 1
If tLoacteIndex > tLoacteLength Then
tLoacteLength = tLoacteLength + &H10000
ReDim Preserve tLoacte(tLoacteLength)
End If
tLoacte(tLoacteIndex) = tLoadStart + tBufferIndex '绝对地址=页面基址+页面相对地址
End If
Next
If Not (pTextBox Is Nothing) Then
pTextBox.Text = Int(tPageIndex * 100 / tPageLength) & "%"
DoEvents
End If
Next
tLoacteIndex = tLoacteIndex + 1
tLoacte(tLoacteIndex) = LOF(tFileNumber) + 1
ReDim Preserve tLoacte(tLoacteIndex)
Close tFileNumber
MakeLoacte = tLoacte()
End Function
解决方案 »
免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货