在网上采集数据,我用的是
For i = 1 To 50
    ' Fetch question page number i.
    URL = "http://148.36.19.209:81/exam/ExamPaperByStep.aspx?paper_id=173&_id=62227&ptype=0&Str_XuHao=" & i
    strData = getHtmlStr(URL)

    Set reg = CreateObject("vbscript.regExp")
    reg.Global = True
    reg.IgnoreCase = True
    reg.MultiLine = True
    ' Matches any "<...>" tag (non-greedy) or a single space character.
    reg.Pattern = "(?:<.*?>| )"
    ' Write the page text with every match removed to file number 1.
    Print #1, reg.Replace(strData, "")
Next

' Downloads strUrl with a synchronous GET request and returns the response
' body decoded as UTF-8.
'
' Parameters:
'   strUrl - address of the page to download.
' Returns:
'   The decoded page text, or an empty string when any step of the
'   request fails.
Public Function getHtmlStr(ByVal strUrl As String) As String
    Dim XmlHttp As Object

    ' Install the handler before touching the COM object so that a failure in
    ' CreateObject or Open is trapped as well, not only a failure in send.
    On Error GoTo Err_net

    Set XmlHttp = CreateObject("Microsoft.XMLHTTP")
    XmlHttp.Open "GET", strUrl, False   ' False = synchronous request
    XmlHttp.send
    getHtmlStr = BytesToBstr(XmlHttp.ResponseBody, "UTF-8")
    Set XmlHttp = Nothing
    Exit Function

Err_net:
    ' Best-effort download: report failure as an empty result.
    getHtmlStr = ""
    Set XmlHttp = Nothing
End Function
' Converts a block of raw bytes into a string by passing it through an
' ADODB.Stream: the bytes are written in binary mode, then the same stream
' is re-read in text mode with the requested character set.
'
' Parameters:
'   strBody  - the raw data to decode (written to the stream as-is).
'   codeBase - character set name assigned to the stream's Charset.
' Returns:
'   The text read back from the stream.
Private Function BytesToBstr(strBody, codeBase) As String
    Const adTypeBinary = 1
    Const adTypeText = 2
    Const adModeReadWrite = 3

    Dim stm As Object
    Set stm = CreateObject("Adodb.Stream")

    With stm
        .Type = adTypeBinary
        .Mode = adModeReadWrite
        .Open
        .Write strBody
        ' Rewind so the type switch and the read start from the first byte.
        .Position = 0
        .Type = adTypeText
        .Charset = codeBase
        BytesToBstr = .ReadText
        .Close
    End With

    Set stm = Nothing
End Function
采集后我有两个问题想问一下。第一个问题:为什么采集的数据里还有一些网页的源码?例如:
setTimeout('top.moveTo(0,0)',50);
setTimeout('top.resizeTo(screen.availWidth,screen.availHeight)',50);
setTimeout("self.focus()",100);
等等。当然这只是一个疑问,希望哪位解释一下!
第二个问题就要大家帮忙了,我想把里面的类似的一段给提出来,其有个共同点就是 我要的数据在"数据"和"标识"这两个字之间, 怎么写正则??我试了几下都没法
' Fetch question page number i.
' NOTE(review): i is presumably the counter of an enclosing For loop whose
' header is not part of this fragment - confirm.
URL = "http://148.36.19.209:81/exam/ExamPaperByStep.aspx?paper_id=173&_id=62227&ptype=0&Str_XuHao=" & i
strData = getHtmlStr(URL)

Set reg = CreateObject("vbscript.regExp")
reg.Global = True
reg.IgnoreCase = True
reg.MultiLine = True
' Matches any "<...>" tag (non-greedy) or a single space character.
reg.Pattern = "(?:<.*?>| )"
' Write the page text with every match removed to file number 1.
Print #1, reg.Replace(strData, "")
Next

' Downloads strUrl with a synchronous GET request and returns the response
' body decoded as UTF-8.
'
' Parameters:
'   strUrl - address of the page to download.
' Returns:
'   The decoded page text, or an empty string when any step of the
'   request fails.
Public Function getHtmlStr(ByVal strUrl As String) As String
    Dim XmlHttp As Object

    ' Install the handler before touching the COM object so that a failure in
    ' CreateObject or Open is trapped as well, not only a failure in send.
    On Error GoTo Err_net

    Set XmlHttp = CreateObject("Microsoft.XMLHTTP")
    XmlHttp.Open "GET", strUrl, False   ' False = synchronous request
    XmlHttp.send
    getHtmlStr = BytesToBstr(XmlHttp.ResponseBody, "UTF-8")
    Set XmlHttp = Nothing
    Exit Function

Err_net:
    ' Best-effort download: report failure as an empty result.
    getHtmlStr = ""
    Set XmlHttp = Nothing
End Function
' Converts a block of raw bytes into a string by passing it through an
' ADODB.Stream: the bytes are written in binary mode, then the same stream
' is re-read in text mode with the requested character set.
'
' Parameters:
'   strBody  - the raw data to decode (written to the stream as-is).
'   codeBase - character set name assigned to the stream's Charset.
' Returns:
'   The text read back from the stream.
Private Function BytesToBstr(strBody, codeBase) As String
    Const adTypeBinary = 1
    Const adTypeText = 2
    Const adModeReadWrite = 3

    Dim stm As Object
    Set stm = CreateObject("Adodb.Stream")

    With stm
        .Type = adTypeBinary
        .Mode = adModeReadWrite
        .Open
        .Write strBody
        ' Rewind so the type switch and the read start from the first byte.
        .Position = 0
        .Type = adTypeText
        .Charset = codeBase
        BytesToBstr = .ReadText
        .Close
    End With

    Set stm = Nothing
End Function
采集后我有两个问题想问一下。第一个问题:为什么采集的数据里还有一些网页的源码?例如:
setTimeout('top.moveTo(0,0)',50);
setTimeout('top.resizeTo(screen.availWidth,screen.availHeight)',50);
setTimeout("self.focus()",100);
等等。当然这只是一个疑问,希望哪位解释一下!
第二个问题就要大家帮忙了,我想把里面的类似的一段给提出来,其有个共同点就是 我要的数据在"数据"和"标识"这两个字之间, 怎么写正则??我试了几下都没法
' Fetch question page number i.
' NOTE(review): i is presumably the counter of an enclosing For loop whose
' header is not part of this fragment - confirm.
URL = "http://148.36.19.209:81/exam/ExamPaperByStep.aspx?paper_id=173&_id=62227&ptype=0&Str_XuHao=" & i
strData = getHtmlStr(URL)

' First pass: Replace with an empty string DELETES every span that runs from
' "题型" to the nearest following "标识"; it does not extract that span.
' NOTE(review): "." in a VBScript RegExp presumably does not match line
' breaks, so a span that crosses lines would not match at all - confirm.
Set reg = CreateObject("vbscript.regExp")
With reg
    .Global = True
    .IgnoreCase = True
    .MultiLine = True
    .Pattern = "题型(.*?)标识"
    s = .Replace(strData, "")
End With
strData = s

' Second pass: a fresh RegExp removes every "<...>" tag and every space,
' and the result is written to file number 1.
Set reg = CreateObject("vbscript.regExp")
With reg
    .Global = True
    .IgnoreCase = True
    .MultiLine = True
    .Pattern = "(?:<.*?>| )"
    Print #1, .Replace(strData, "")
End With
Next
发现只有 reg.Pattern = "(?:<.*?>| )" 起作用;单独用 reg.Pattern = "题型(.*?)标识" 是一点儿作用也不起啊。
' Fetch question page number i.
' NOTE(review): i is presumably the counter of an enclosing For loop whose
' header is not part of this fragment - confirm.
URL = "http://148.36.19.209:81/exam/ExamPaperByStep.aspx?paper_id=173&_id=62227&ptype=0&Str_XuHao=" & i
strData = getHtmlStr(URL)

Set reg = CreateObject("vbscript.regExp")
With reg
    .Global = True
    .IgnoreCase = True
    .MultiLine = True

    ' Step 1: drop whole <script ...>...</script> blocks, including the
    ' JavaScript between the tags ([\s\S] also spans line breaks).
    .Pattern = "<script .*?>[\s\S]*?</script>"
    strData = .Replace(strData, "")

    ' Step 2: drop every remaining "<...>" tag and every space character.
    .Pattern = "(?:<.*?>| )"
    strData = .Replace(strData, "")
End With

' Write the cleaned text to file number 1.
Print #1, strData
Next
其他类似问题可能还有 CSS 标签块等。
备注:
是UTF8编码,
' Demonstrates extracting the text that lies between the words "数据" and
' "标识" with a VBScript RegExp, printing each captured span to the
' Immediate window.
Private Sub TestReg()
    Dim strData As String
    Dim reg As Object
    Dim matchs As Object, match As Object

    ' Sample input; doubled quotes are literal quote characters in VB.
    strData = "我要的数据在""数据""和""标识""这两个字之间, 怎么写正则??我试了几下都没法"

    Set reg = CreateObject("vbscript.regExp")
    reg.Global = True
    reg.IgnoreCase = True
    reg.MultiLine = True
    ' [\s\S]*? lazily matches any characters, line breaks included, so the
    ' capture group stops at the nearest following "标识".
    reg.Pattern = "数据([\s\S]*?)标识"

    Set matchs = reg.Execute(strData)
    For Each match In matchs
        'Debug.Print match.Value
        ' SubMatches(0) is the first capture group: the text between the two words.
        Debug.Print match.SubMatches(0)
    Next
End Sub
<BODY>
......................................
<P style="LINE-HEIGHT: 1.5; TEXT-ALIGN: left">感谢您百忙之中上线................
....................................
<P style="LINE-HEIGHT: 1.5; TEXT-ALIGN: left">通过 <STRONG>题库练习</STRONG>,.........................................................
我认为没有用的源码省了,我要的就是把红字中间的汉字单独提出来,如何?
<head><title>
**财税网上学校
</title>
<script type="text/javascript">
setTimeout('top.moveTo(0,0)',50);
setTimeout('top.resizeTo(screen.availWidth,screen.availHeight)',50);
setTimeout("self.focus()",100);
</script><link runat="server" id="CssSkin" href="../Skin/Skin023Zjjds/Skin023Zjjds.css" type="text/css" rel="stylesheet" text="text/css" /></head>
<script language="javascript" type ="text/javascript">
var oInterval="";
//中间的源码省去!
<span id="Label17" style="font-size:Larger;font-size: 14px">本卷满分为</span>
<input name="tbxMarkFull" type="text" value="100" id="tbxMarkFull" disabled="disabled" class="textbox3" style="height:21px;width:40px;" />分</td>
<td style="width:100px;height: 5px">
<input type="submit" name="Button_SJTJ" value="交卷" onclick="javascript:return confirm('您确定要提交么?');" id="Button_SJTJ" class="btn01" style="height:40px;width:80px;font-size: 14px" /></td>
<td style=" width:100px;height: 5px; ">
<a onclick="javascript:alert('时间已到,系统强制收卷,谢谢您的参与!');" id="Button_QiangZhi" href="javascript:__doPostBack('Button_QiangZhi','')"></a></td>
<td style="height: 5px">
</td>
</tr>
<tr>
<td style="height: 1px" colspan="7" valign="bottom">
<span id="Label_PaperName" style="display:inline-block;height:20px;width:100%;font-weight: bold; vertical-align: baseline; text-align: center">一月份电子考场</span></td>
</tr>
</table> <br />
<table id="Table_List" width="750" style="font-size: 14px">
<tr>
<td>
<span id="Label3">题目总数</span></td>
<td>
<span id="Label5">做过题目数</span></td>
<td>
<span id="Label7">剩余题目数</span></td>
<td>
<span id="Label9"></span></td>
<td>
<span id="Label12"></span></td>
<td>
<span id="Label13"></span></td>
</tr>
<tr>
<td>
<span id="lblZS">50</span></td>
<td>
<span id="lblZG">1</span></td>
<td>
<span id="lblSY">49</span></td>
<td>
<span id="lblHG"></span></td>
<td>
<span id="lblYD"></span></td>
<td>
<span id="lblIsHG"></span></td>
</tr>
</table> <br />
<table id="Table_List2" border="0" style="border-color:#C0C0FF;border-width:1px;border-style:Solid;width:750px;font-size: 14px"></table>
<table id="Table_Exam" border="0" style="border-color:#C0C0FF;border-width:1px;border-style:Solid;width:750px;font-size: 14px">
<tr style="border-width:1px;border-style:solid;">
<td><span style="font-weight:bold;">题型:单选题 本题分数: 2分</span></td>
</tr><tr>
<td><span>2、根据企业所得税法的规定,除固定资产改建支出、大修理支出外的其他长期待摊费用摊销年限最低为( )。</span></td>
</tr><tr>
<td><table id="1446815" border="0">
<tr>
<td><input id="1446815_0" type="radio" name="1446815" value="A、3年" /><label for="1446815_0">A、3年</label></td>
</tr><tr>
<td><input id="1446815_1" type="radio" name="1446815" value="B、4年" /><label for="1446815_1">B、4年</label></td>
</tr><tr>
<td><input id="1446815_2" type="radio" name="1446815" value="C、5年" /><label for="1446815_2">C、5年</label></td>
</tr><tr>
<td><input id="1446815_3" type="radio" name="1446815" value="D、10年" /><label for="1446815_3">D、10年</label></td>
</tr>
</table></td>
</tr>
</table>
<table id="tblBtn" border="0" cellpadding="0" cellspacing="0" style="width: 750px;">
<tr>
<td colspan="6" rowspan="3">
<br />
<input type="submit" name="btnOK" value="确定" id="btnOK" class="btn" />
<input type="submit" name="btnJump" value="跳过此题" id="btnJump" class="btn" />
<span style="font-weight:bold;"><input id="CheckBox1" type="checkbox" name="CheckBox1" onclick="javascript:setTimeout('__doPostBack(\'CheckBox1\',\'\')', 0)" /><label for="CheckBox1">标记</label></span><br />
<br />
<br />
<table id="Table1" border="0" cellpadding="0" cellspacing="0" style="width: 750px; font-size: 14px; font-weight: bold;">
//此间的源码省了
</tr>
</table> <table id="tblTip" border="0" cellpadding="0" cellspacing="0" height="60" style="width: 750px; font-size: 14px;">
<tr>
<td id="tdPutong2" style="height: 1px" colspan="6" valign="bottom">
---跳过此题:点击后跳过当前的题目,全部做完后,关闭页面,再打开试卷,跳过的题目会再次出现。</td>
</tr>
<tr>
<td id="tdColor" colspan="6" style="height: 1px" valign="bottom">
---颜色含义:<font color="blue"><strong>蓝色</strong></font>表示做过的题,<strong>黑色</strong>表示未做的题。<br />
<font color="red"><strong>
红色</strong></font>表示打了标记的题(表示对此题把握不大,题目全部做完后想回头认真检查这些题目)。</td>
</tr>
<tr>
</tr>
</table> <br />
<br />
<br />
<br />
<br />
<br />
</form>
<script language ="javascript" type="text/javascript"> fnStartInterval();
</script>
</body>
</html>
我需要的数据是:
<td><span style="font-weight:bold;">题型:单选题 本题分数: 2分</span></td>
</tr><tr>
<td><span>2、根据企业所得税法的规定,除固定资产改建支出、大修理支出外的其他长期待摊费用摊销年限最低为( )。</span></td>
</tr><tr>
<td><table id="1446815" border="0">
<tr>
<td><input id="1446815_0" type="radio" name="1446815" value="A、3年" /><label for="1446815_0">A、3年</label></td>
</tr><tr>
<td><input id="1446815_1" type="radio" name="1446815" value="B、4年" /><label for="1446815_1">B、4年</label></td>
</tr><tr>
<td><input id="1446815_2" type="radio" name="1446815" value="C、5年" /><label for="1446815_2">C、5年</label></td>
</tr><tr>
<td><input id="1446815_3" type="radio" name="1446815" value="D、10年" /><label for="1446815_3">D、10年</label></td>
</tr>
</table></td>
</tr>
</table>
<table id="tblBtn" border="0" cellpadding="0" cellspacing="0" style="width: 750px;">
<tr>
<td colspan="6" rowspan="3">
<br />
<input type="submit" name="btnOK" value="确定" id="btnOK" class="btn" />
<input type="submit" name="btnJump" value="跳过此题" id="btnJump" class="btn" />
<span style="font-weight:bold;"><input id="CheckBox1" type="checkbox" name="CheckBox1" onclick="javascript:setTimeout('__doPostBack(\'CheckBox1\',\'\')', 0)" /><label for="CheckBox1">标记
把中间的 HTML 源码去除后,就得到一道正确的练习题;如果能把“确定”按钮也去除最好!
' Prints every span of strData that starts at ">题型:" and ends at the
' nearest following id="CheckBox1" to the Immediate window.
Private Sub TestReg()
    Dim strData As String
    Dim reg As Object
    Dim matchs As Object, match As Object

    ' Placeholder input; substitute the downloaded page text here.
    strData = "xxx"

    Set reg = CreateObject("vbscript.regExp")
    reg.Global = True
    reg.IgnoreCase = True
    reg.MultiLine = True
    ' [\s\S]*? lazily matches any characters, line breaks included.
    reg.Pattern = ">题型:[\s\S]*?id=""CheckBox1"""

    Set matchs = reg.Execute(strData)
    For Each match In matchs
        ' The pattern has no capture group, so SubMatches is empty and
        ' SubMatches(0) would raise a run-time error; print the whole match.
        Debug.Print match.Value
    Next
End Sub
这样就得到了正确的数据。但是我要是把 reg.Pattern = ">题型:[\s\S]*?id=""CheckBox1"""
换成 reg.Pattern = "题型:[\s\S]*?标识",那它就什么数据也提不出来,为什么?
此贴追加40分吧,感谢sysdzw!
在报纸或杂志上随便找一段约1000字的文章,在Word中输入一遍。输完后再参考下面答案:
A里面有10处以上文字或标点错误
B里面没有文字或标点错误并敢为此跟人打赌
C里面没有文字或标点错误并且字体和排版完全与原稿一致
D打印在半透明的纸上和原稿重叠在一起检查一模一样,且自我感觉很有成就感
A不适合编程(理由:打字准确度偏低、粗心大意)
B初级程序员(理由:打字准确度很高、认真细致、自信、理解全角半角概念)
C高级程序员(理由:在B的基础上理解字体和排版也是电脑打印的重要因素、但相比D还不够偏执、精益求精、结果可验证)
D软件项目经理(理由:能针对项目给出令人信服的细致到极点的需求说明和典型测试用例。用户几乎挑不出毛病。专业!)
如果想从A变成B的话,到我的资源http://zhao4zhong1.download.csdn.net/里面下载“适合程序员的键盘练习”