一个简单的Html转换Txt程序的基本原理就是--将超文本文件不可视部分去掉,将超文本文件可视部分的内容写到文本文件中去。 好了,有了理论就应该有些实际的东西了。转换的步骤可以简单地归纳为: 1、 去掉$#@60;head$#@62;部分的内容 2、 去掉$#@60;script$#@62;部分的Java脚本程序 3、 转换$#@60;br为换行符 4、 转换$#@60;/p$#@62;为换行符 5、 转换和去掉其它所有的超文本标记 6、 转换“$#@60;”为“<”符号 7、 转换“$#@62;”为“>”符号 8、 转换“&”为“&”符号 9、 转换“ ”为空格符号 10、转换“"”为引号 11、去掉转换后开头和结尾出现的所有空格符号 12、转换完成 转换的步骤可用VB的函数IsStr来实现,如下面的代码可以去掉文本框Text3中超文本文件开头到$#@60;/Head$#@62;标记部分的内容: Do While InStr(1, LCase(Text3.Text), "$#@60;/head$#@62;") <> 0 Text3.SelStart = 0 Text3.SelLength = InStr(1, LCase(Text3.Text), "$#@60;/head$#@62;") + 6 Text3.SelText = "" Loop 详细的程序代码请参看程序清单或源程序。这个转换程序设计时考虑的是转换规范的超文本文件,当要转换的文件不够标准(如:有$#@60;/head$#@62;而没有$#@60;head$#@62;与之配对)的时候,转换就不能完成。而且,这个例子只转换了部分的超文本标记,还有许多的标记,如表单标记“FORM”并没有被转换,更多、更详尽的功能就有待你完成了。附程序清单: Option Explicit Private Sub Form_Load()CommonDialog1.CancelError = True Text3.Visible = False Command1.Capti = "打开" Command2.Caption = "转换==$#@62;" Command3.Caption = "保存"End Sub Private Sub Command1_Click() On Error Resume Next Dim TextLine As String CommonDialog1.Filter = "网页|*.htm;*.html" CommonDialog1.ShowOpen If err $#@60;$#@62; 32755 ThenText1 = "" "打开文件 Open CommonDialog1.FileName For Input As #1 Do While Not EOF(1)Line Input #1, TextLine Text1 = Text1 & Trim(TextLine)Loop Close #1ElseMsgBox "不能打开文件"End IfEnd Sub Private Sub Command3_Click()On Error Resume Next CommonDialog1.Filter = "文本文件|*.txt" CommonDialog1.ShowSave If err $#@60;$#@62; 32755 ThenOpen CommonDialog1.FileName For Output As #1 Print #1, Text3 Close #1ElseMsgBox "不能保存文件"End IfEnd Sub Private Sub Command2_Click()Dim txtStr As String On Error GoTo err Form1.MousePointer = 11 Text3.Text = Text1.Text DoEvents Form1.Caption = "正在去掉$#@60;head$#@62;部分..." "去掉$#@60;head$#@62;部分 Do While InStr(1, LCase(Text3.Text), "$#@60;/head$#@62;") $#@60;$#@62; 0Text3.SelStart = 0 Text3.SelLength = InStr(1, LCase(Text3.Text), "$#@60;/head$#@62;") + 6 Text3.SelText = ""Loop Form1.Caption = "正在去掉$#@60;script$#@62;部分..." "去掉$#@60;script$#@62;部分 Do While InStr(1, LCase(Text3.Text), "$#@60;/script$#@62;") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), "$#@60;script") - 1 Text3.SelLength = InStr(1, LCase(Text3.Text), "$#@60;/script$#@62;") - Text3.SelStart + 9 Text3.SelText = ""Loop Form1.Caption = "正在转换$#@60;br$#@62;为换行符..." "转换$#@60;br$#@62;为换行符 Do While InStr(1, LCase(Text3.Text), "$#@60;br$#@62;") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), "$#@60;br$#@62;") - 1 Text3.SelLength = 4 Text3.SelText = "" + vbCrLfLoop Form1.Caption = "正在转换$#@60;p$#@62;$#@60;/p$#@62;为换行符..." "转换$#@60;/p$#@62;为换行符 Do While InStr(1, LCase(Text3.Text), "$#@60;/p$#@62;") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), "$#@60;/p$#@62;") - 1 Text3.SelLength = 4 Text3.SelText = "" + vbCrLfLoop Form1.Caption = "正在删除Html标记..." "去掉其它的Html标记 Do While InStr(1, LCase(Text3.Text), "$#@60;") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), "$#@60;") - 1 Text3.SelLength = InStr(1, LCase(Text3.Text), "$#@62;") - Text3.SelStart Text3.SelText = ""Loop Form1.Caption = "正在转换"$#@60;"为"$#@60;"..." "转换"$#@60;"为"$#@60;" Do While InStr(1, LCase(Text3.Text), "$#@60;") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), "$#@60;") - 1 Text3.SelLength = 4 Text3.SelText = "$#@60;"Loop Form1.Caption = "正在转换"$#@62;"为"$#@62;"..." "转换"$#@62;"为"$#@62;" Do While InStr(1, LCase(Text3.Text), "$#@62;") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), "$#@62;") - 1 Text3.SelLength = 4 Text3.SelText = "$#@62;"Loop Form1.Caption = "正在转换"&"为"&"..." "转换"&"为"&" Do While InStr(1, LCase(Text3.Text), "&") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), "&") - 1 Text3.SelLength = 5 Text3.SelText = "&"Loop Form1.Caption = "正在转换" "为空格符..." "转换" "为" " Do While InStr(1, LCase(Text3.Text), " ") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), " ") - 1 Text3.SelLength = 6 Text3.SelText = " "Loop Form1.Caption = "正在转换"""为引号..." "转换"""为引号 Do While InStr(1, LCase(Text3.Text), """) $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), """) - 1 Text3.SelLength = 6 Text3.SelText = """"Loop "去掉前后空格符 Text2.Text = Trim$(Text3.Text) Form1.Caption = "转换成功!" Form1.MousePointer = 0 Exit Suberr:Form1.Caption = "转换错误" Form1.MousePointer = 0 MsgBox "转换时出错,请检查你的源文件!" Exit SubEnd Sub
提供给你一段程序,这个程序是把一段文本中所有的html标记去调。只能称为是个原型,它没有对script脚本、object进行处理,也没有对那些“ ”一类的标记进行处理,前者需要再程序中再加入一个标记,以进行一段的删除,后者可以再这里的sResult上再作处理Private Sub Command1_Click() Dim bQuoteFound As Boolean '是否找到引号的标志 Dim bSignalFound As Boolean '是否找到<的标志 Dim ch As String Dim sQuote As String Dim sResult As String
sQuote = """" '记录"的字符 For i = 1 To Len(Text1.Text) ch = Mid$(Text1.Text, i, 1) If Not bQuoteFound Then If ch = sQuote Then bQuoteFound = True Else If bSignalFound Then If ch = ">" Then bSignalFound = False End If Else If ch = "<" Then bSignalFound = True Else sResult = sResult + ch End If End If End If Else If ch = sQuote Then bQuoteFound = False Else If Not bSignalFound Then sResult = sResult + ch End If End If End If Next
Text1.Text = sResult End Sub
简单的方法,使用 WebBrowser 控件:With Me.WebBrowser1 .Navigate2 "http://www.csdn.net" While .Busy DoEvents Wend Debug.Print .Document.body.innertext End With
这个效果还不错 Option ExplicitPrivate Sub cmdBrowse_Click() On Error Resume Next dlgFileOpen.Filename = "*.htm" dlgFileOpen.Action = 1 If Err Then Exit Sub txtFile = dlgFileOpen.Filename End Sub Private Sub cmdStripFile_Click() StripText CStr(txtFile) End Sub模块 Option ExplicitSub StripText(Filename$)Dim f%, ff%, t$ Dim percent&, total& Dim is_tag%, write2file$, i% On Error GoTo Err_Handler ' Opens the file, reads the data and saves the lines ' that are text. ' Set the mousepointer to hourglass Screen.MousePointer = 11 ' Get a free file handle f% = FreeFile ' Open the HTML file in read mode Open Filename$ For Input As #f% ' Get a free file handle ff% = FreeFile ' Open the output file name, which is the old filename ' + "txt" Open Left$(Filename$, InStr(Filename$, ".")) & "TXT" For Output As #ff% ' Find the total number of bytes to read total& = LOF(f%) ' Loop through the entire file Do While Not EOF(f%) ' Read one line Line Input #f%, t$ ' Count the number of bytes read (including CR + LF) percent& = percent& + Len(t$) + 2 ' Calculate the percent and show it in the status label If CInt(percent& * 100 / total&) Mod 10 = 0 Then frmMain!lblStatus = "Reading " & CInt(percent& * 100 / total&) & "%" ' Refresh it to make it update on the screen frmMain!lblStatus.Refresh End If ' Now scan the entire string to find the "<"'s and the ">"'s. write2file$ = "" For i% = 1 To Len(t$) Select Case Mid$(t$, i%, 1) Case "<" is_tag% = True Case ">" is_tag% = False Case Else If Not is_tag% Then write2file$ = write2file$ & Mid$(t$, i%, 1) End Select Next ' Write the line to the file Print #ff%, write2file$ ' Next line Loop ' Update label with status frmMain!lblStatus = "Wrote " & Left$(Filename$, InStr(Filename$, ".")) & "TXT"Exit_Sub: ' CLose both file Close #f% Close #ff% ' Reset mousepointer Screen.MousePointer = 0 ' Exit Exit SubErr_Handler: ' if there was an error, display it in the status ' label frmMain!lblStatus = "Error: " & Error$(Err) ' Then exit Resume Exit_SubEnd Sub
简单的方法,使用 WebBrowser 控件:With Me.WebBrowser1 .Navigate2 "http://www.csdn.net" While .Busy DoEvents Wend Debug.Print .Document.body.innertext End With
好了,有了理论就应该有些实际的东西了。转换的步骤可以简单地归纳为:
1、 去掉$#@60;head$#@62;部分的内容
2、 去掉$#@60;script$#@62;部分的Java脚本程序
3、 转换$#@60;br为换行符
4、 转换$#@60;/p$#@62;为换行符
5、 转换和去掉其它所有的超文本标记
6、 转换“$#@60;”为“<”符号
7、 转换“$#@62;”为“>”符号
8、 转换“&”为“&”符号
9、 转换“ ”为空格符号
10、转换“"”为引号
11、去掉转换后开头和结尾出现的所有空格符号
12、转换完成
转换的步骤可用VB的函数IsStr来实现,如下面的代码可以去掉文本框Text3中超文本文件开头到$#@60;/Head$#@62;标记部分的内容:
Do While InStr(1, LCase(Text3.Text), "$#@60;/head$#@62;") <> 0
Text3.SelStart = 0
Text3.SelLength = InStr(1, LCase(Text3.Text), "$#@60;/head$#@62;") + 6
Text3.SelText = ""
Loop
详细的程序代码请参看程序清单或源程序。这个转换程序设计时考虑的是转换规范的超文本文件,当要转换的文件不够标准(如:有$#@60;/head$#@62;而没有$#@60;head$#@62;与之配对)的时候,转换就不能完成。而且,这个例子只转换了部分的超文本标记,还有许多的标记,如表单标记“FORM”并没有被转换,更多、更详尽的功能就有待你完成了。附程序清单:
Option Explicit
Private Sub Form_Load()CommonDialog1.CancelError = True
Text3.Visible = False
Command1.Capti = "打开"
Command2.Caption = "转换==$#@62;"
Command3.Caption = "保存"End Sub
Private Sub Command1_Click()
On Error Resume Next
Dim TextLine As String
CommonDialog1.Filter = "网页|*.htm;*.html"
CommonDialog1.ShowOpen
If err $#@60;$#@62; 32755 ThenText1 = ""
"打开文件
Open CommonDialog1.FileName For Input As #1
Do While Not EOF(1)Line Input #1, TextLine
Text1 = Text1 & Trim(TextLine)Loop
Close #1ElseMsgBox "不能打开文件"End IfEnd Sub
Private Sub Command3_Click()On Error Resume Next
CommonDialog1.Filter = "文本文件|*.txt"
CommonDialog1.ShowSave
If err $#@60;$#@62; 32755 ThenOpen CommonDialog1.FileName For Output As #1
Print #1, Text3
Close #1ElseMsgBox "不能保存文件"End IfEnd Sub
Private Sub Command2_Click()Dim txtStr As String
On Error GoTo err
Form1.MousePointer = 11
Text3.Text = Text1.Text
DoEvents
Form1.Caption = "正在去掉$#@60;head$#@62;部分..."
"去掉$#@60;head$#@62;部分
Do While InStr(1, LCase(Text3.Text), "$#@60;/head$#@62;") $#@60;$#@62; 0Text3.SelStart = 0
Text3.SelLength = InStr(1, LCase(Text3.Text), "$#@60;/head$#@62;") + 6
Text3.SelText = ""Loop
Form1.Caption = "正在去掉$#@60;script$#@62;部分..."
"去掉$#@60;script$#@62;部分
Do While InStr(1, LCase(Text3.Text), "$#@60;/script$#@62;") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), "$#@60;script") - 1
Text3.SelLength = InStr(1, LCase(Text3.Text), "$#@60;/script$#@62;") - Text3.SelStart + 9
Text3.SelText = ""Loop
Form1.Caption = "正在转换$#@60;br$#@62;为换行符..."
"转换$#@60;br$#@62;为换行符
Do While InStr(1, LCase(Text3.Text), "$#@60;br$#@62;") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), "$#@60;br$#@62;") - 1
Text3.SelLength = 4
Text3.SelText = "" + vbCrLfLoop
Form1.Caption = "正在转换$#@60;p$#@62;$#@60;/p$#@62;为换行符..."
"转换$#@60;/p$#@62;为换行符
Do While InStr(1, LCase(Text3.Text), "$#@60;/p$#@62;") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), "$#@60;/p$#@62;") - 1
Text3.SelLength = 4
Text3.SelText = "" + vbCrLfLoop
Form1.Caption = "正在删除Html标记..."
"去掉其它的Html标记
Do While InStr(1, LCase(Text3.Text), "$#@60;") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), "$#@60;") - 1
Text3.SelLength = InStr(1, LCase(Text3.Text), "$#@62;") - Text3.SelStart
Text3.SelText = ""Loop
Form1.Caption = "正在转换"$#@60;"为"$#@60;"..."
"转换"$#@60;"为"$#@60;"
Do While InStr(1, LCase(Text3.Text), "$#@60;") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), "$#@60;") - 1
Text3.SelLength = 4
Text3.SelText = "$#@60;"Loop
Form1.Caption = "正在转换"$#@62;"为"$#@62;"..."
"转换"$#@62;"为"$#@62;"
Do While InStr(1, LCase(Text3.Text), "$#@62;") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), "$#@62;") - 1
Text3.SelLength = 4
Text3.SelText = "$#@62;"Loop
Form1.Caption = "正在转换"&"为"&"..."
"转换"&"为"&"
Do While InStr(1, LCase(Text3.Text), "&") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), "&") - 1
Text3.SelLength = 5
Text3.SelText = "&"Loop
Form1.Caption = "正在转换" "为空格符..."
"转换" "为" "
Do While InStr(1, LCase(Text3.Text), " ") $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), " ") - 1
Text3.SelLength = 6
Text3.SelText = " "Loop
Form1.Caption = "正在转换"""为引号..."
"转换"""为引号
Do While InStr(1, LCase(Text3.Text), """) $#@60;$#@62; 0Text3.SelStart = InStr(1, LCase(Text3.Text), """) - 1
Text3.SelLength = 6
Text3.SelText = """"Loop
"去掉前后空格符
Text2.Text = Trim$(Text3.Text)
Form1.Caption = "转换成功!"
Form1.MousePointer = 0
Exit Suberr:Form1.Caption = "转换错误"
Form1.MousePointer = 0
MsgBox "转换时出错,请检查你的源文件!"
Exit SubEnd Sub
Dim bQuoteFound As Boolean '是否找到引号的标志
Dim bSignalFound As Boolean '是否找到<的标志
Dim ch As String
Dim sQuote As String
Dim sResult As String
sQuote = """" '记录"的字符
For i = 1 To Len(Text1.Text)
ch = Mid$(Text1.Text, i, 1)
If Not bQuoteFound Then
If ch = sQuote Then
bQuoteFound = True
Else
If bSignalFound Then
If ch = ">" Then
bSignalFound = False
End If
Else
If ch = "<" Then
bSignalFound = True
Else
sResult = sResult + ch
End If
End If
End If
Else
If ch = sQuote Then
bQuoteFound = False
Else
If Not bSignalFound Then
sResult = sResult + ch
End If
End If
End If
Next
Text1.Text = sResult
End Sub
.Navigate2 "http://www.csdn.net"
While .Busy
DoEvents
Wend
Debug.Print .Document.body.innertext
End With
Option ExplicitPrivate Sub cmdBrowse_Click() On Error Resume Next
dlgFileOpen.Filename = "*.htm"
dlgFileOpen.Action = 1
If Err Then Exit Sub
txtFile = dlgFileOpen.Filename
End Sub
Private Sub cmdStripFile_Click()
StripText CStr(txtFile)
End Sub模块
Option ExplicitSub StripText(Filename$)Dim f%, ff%, t$
Dim percent&, total&
Dim is_tag%, write2file$, i% On Error GoTo Err_Handler ' Opens the file, reads the data and saves the lines
' that are text. ' Set the mousepointer to hourglass
Screen.MousePointer = 11 ' Get a free file handle
f% = FreeFile
' Open the HTML file in read mode
Open Filename$ For Input As #f%
' Get a free file handle
ff% = FreeFile
' Open the output file name, which is the old filename
' + "txt"
Open Left$(Filename$, InStr(Filename$, ".")) & "TXT" For Output As #ff%
' Find the total number of bytes to read
total& = LOF(f%)
' Loop through the entire file
Do While Not EOF(f%)
' Read one line
Line Input #f%, t$
' Count the number of bytes read (including CR + LF)
percent& = percent& + Len(t$) + 2
' Calculate the percent and show it in the status label
If CInt(percent& * 100 / total&) Mod 10 = 0 Then
frmMain!lblStatus = "Reading " & CInt(percent& * 100 / total&) & "%"
' Refresh it to make it update on the screen
frmMain!lblStatus.Refresh
End If
' Now scan the entire string to find the "<"'s and the ">"'s.
write2file$ = ""
For i% = 1 To Len(t$)
Select Case Mid$(t$, i%, 1)
Case "<"
is_tag% = True
Case ">"
is_tag% = False
Case Else
If Not is_tag% Then write2file$ = write2file$ & Mid$(t$, i%, 1)
End Select
Next
' Write the line to the file
Print #ff%, write2file$
' Next line
Loop
' Update label with status
frmMain!lblStatus = "Wrote " & Left$(Filename$, InStr(Filename$, ".")) & "TXT"Exit_Sub:
' CLose both file
Close #f%
Close #ff%
' Reset mousepointer
Screen.MousePointer = 0
' Exit
Exit SubErr_Handler:
' if there was an error, display it in the status
' label
frmMain!lblStatus = "Error: " & Error$(Err)
' Then exit
Resume Exit_SubEnd Sub
.Navigate2 "http://www.csdn.net"
While .Busy
DoEvents
Wend
Debug.Print .Document.body.innertext
End With