1. 首先确定出 http://www.cccc.com/aa/bb/cc/dd/ee.htm 的根:http://www.cccc.com
2. 确定出 http://www.cccc.com/aa/bb/cc/dd/ee.htm 的目录:http://www.cccc.com/aa/bb/cc/dd
3. 在所有像 href="/、src="/ 这样的前面加入 http://www.cccc.com
4. 在所有 href="???.htm、src="???.jpg 前面加入 http://www.cccc.com/aa/bb/cc/dd/
5. 像 href="../../../df/gov.htm",先计算出有多少个 ../ ,这里有 3 个,表示从 http://www.cccc.com/aa/bb/cc/dd 后退 3 层:先将 http://www.cccc.com/aa/bb/cc/dd 反向排列,找到第 3 个 / 的位置,从这个位置开始提取字符,将提取的字符再次反向,得到 http://www.cccc.com/aa/ ;再提取 href="../../../df/gov.htm" 中第 3 个 / 后面的所有字符,和前面得到的 http://www.cccc.com/aa/ 组合,即 http://www.cccc.com/aa/df/gov.htm。
6. 像 ../ 和 ../../ 用第 5 步的办法同样计算。
7. 像 url(???/??.jpg) 只要在 url( 后面插入 http://www.cccc.com/aa/bb/cc/dd/ 就行了。
// Find every href="..." / src="..." attribute (single- or double-quoted) in the
// page and capture its value in the named group "url".
// The attribute names are grouped with (?:...) so that the quoted value is
// required after BOTH href and src; without the grouping the alternation
// splits the pattern in two and an href match carries no URL at all.
string pattern = @"(?:href|src)\s*=\s*[""'](?<url>[^""']+)[""']";
Regex r = new Regex(pattern, RegexOptions.Compiled | RegexOptions.IgnoreCase);
for (Match m = r.Match(YourHtmlPageString); m.Success; m = m.NextMatch())
{
    string url = m.Groups["url"].Value;
    // Process this URL
}
你的思路是不错的,我也是这样做的。但由于 HTML 代码的复杂性,有的属性值不含引号(单引号或双引号),有的还可能有嵌套(特别是在 JavaScript 语句中),处理起来并不简单。
我现在差不多已经搞好了,但还有少量 BUG,而且搞得很麻烦,伤神啊。现在又还要学习更多的东西:ASP.NET、C#、组件什么的。各位:
谁有开发组件方面的经验,如何将一个dll绑定到一个目录下(端口也行),
比如:我想将http://www.lgdaily.com转为繁体,下面有许多目录,现在可不可以建立一个/big5/目录,直接将原来的:
http://www.lgdaily.com/newscontent/32213305/2003-06-10-10-06-31.asp改为:
http://www.lgdaily.com/Big5/newscontent/32213305/2003-06-10-10-06-31.asp然后就输出为繁体。全站其他网页均作类似处理而得到繁体网页。
这样不知可行否?
<%@ Page Language="VB" debug="true"%>
<%@ Import Namespace="System.Net" %>
<%@ Import Namespace="System.IO" %>
<script language="VB" runat="server">
' Downloads the page named by the "Url" request parameter, rewrites the
' relative links in it to absolute ones and writes the result to the response.
'
' NOTE(review): any http:// URL supplied by the caller is fetched and echoed
' back unescaped, so this page behaves as an open proxy. Consider restricting
' the hosts that may be requested before exposing it publicly.
Sub Page_load(sender as Object,E as EventArgs)
    If IsPostBack=False Then
        Dim strUrl As String
        strUrl=Request.QueryString("Url")
        If strUrl="" Then
            strUrl=Trim(Request.Params("Url"))
        End If
        ' DoLinks treats the last path segment as the document name, so a
        ' trailing slash is removed here.
        strUrl=strUrl.TrimEnd("/"c)
        If strUrl<>"" AndAlso strUrl.StartsWith("http://") Then
            Dim wc As New System.Net.WebClient()
            Dim html As String = Nothing
            Try
                ' The bytes are decoded with the server's default code page.
                html=Encoding.Default.GetString(wc.DownloadData(strUrl))
            Finally
                ' Release the client even when the download throws.
                wc.Dispose()
            End Try
            Dim strRegEx As String
            ' Matches href= / src= / url= / background= followed either by a
            ' value that ends at whitespace, or by everything up to the next ">".
            ' NOTE(review): "\2" refers to the group that encloses it and so
            ' appears to match nothing; "\3" (the quote) was probably intended.
            ' RegExLinks does not depend on it.
            strRegEx="\b(href|src|url|background)=((""|')?\s*([^\>\s]*?)\2?(\s)|([^>]*?>))"
            html=RegExLinks(strRegEx,html,strUrl)
            ' Earlier pattern variants, kept for reference:
            ' strRegEx="\b(href|src|background)=(""|')?\s*([^\>\s]*?)\2?(\s)"
            ' strRegEx="\b(href|src|background)\s*=\s*(""|')?\s*([^>\s]*?)\2?\/?>"
            Response.Write(html)
        End If
    End If
End Sub

' Rewrites every link attribute found by strRegEx in html so that it points
' at an absolute URL, using strUrl as the address of the page.
'
'   strRegEx - pattern whose matches have the form name=value
'   html     - page source to process
'   strUrl   - absolute URL the page was downloaded from
'
' Returns the rewritten page source. Each match is replaced at its own
' position, so identical text elsewhere in the page is left untouched and
' nothing is inserted after a closing ">".
Function RegExLinks(ByVal strRegEx as string,ByVal html as string,ByVal strUrl as string)
    If html Is Nothing OrElse html.Length=0 Then
        Return html
    End If

    Dim objRegEx As New Regex(strRegEx,RegexOptions.IgnoreCase Or RegexOptions.Multiline)
    Dim sbOut As New System.Text.StringBuilder(html.Length+256)
    Dim iPos As Integer = 0
    Dim objMatch As Match

    For Each objMatch In objRegEx.Matches(html)
        ' Copy the text between the previous match and this one unchanged.
        sbOut.Append(html,iPos,objMatch.Index-iPos)
        sbOut.Append(RewriteLinkAttribute(objMatch.Value,strUrl))
        iPos=objMatch.Index+objMatch.Length
    Next
    sbOut.Append(html,iPos,html.Length-iPos)

    RegExLinks=sbOut.ToString()
End Function

' Rewrites one matched attribute (for example href="a.htm"> or src=a.gif
' followed by a space): the link part is passed through DoLinks, while the
' attribute name, the quotes and any trailing whitespace or ">" are kept
' exactly as they were. Text without "=" or with an empty link is returned
' unchanged.
Private Function RewriteLinkAttribute(ByVal strAttr As String,ByVal strUrl As String) As String
    Dim iEq As Integer = strAttr.IndexOf("="c)
    If iEq<0 Then
        Return strAttr
    End If

    ' Split at the FIRST "=" only, so links such as
    ' news.asp?date=200306&page=2 stay in one piece.
    Dim strName As String = strAttr.Substring(0,iEq+1)
    Dim strRest As String = strAttr.Substring(iEq+1)

    ' Skip whitespace between "=" and the value.
    Dim iStart As Integer = 0
    Do While iStart<strRest.Length AndAlso Char.IsWhiteSpace(strRest.Chars(iStart))
        iStart=iStart+1
    Loop

    ' Quoted value: the link runs up to the matching closing quote.
    Dim iEnd As Integer = -1
    If iStart<strRest.Length Then
        Dim chFirst As Char = strRest.Chars(iStart)
        If chFirst=""""c OrElse chFirst="'"c Then
            iStart=iStart+1
            iEnd=strRest.IndexOf(chFirst,iStart)
        End If
    End If

    ' Unquoted value, or the closing quote lies outside the match: the link
    ' ends at the first whitespace character or ">".
    If iEnd<0 Then
        iEnd=iStart
        Do While iEnd<strRest.Length AndAlso Not Char.IsWhiteSpace(strRest.Chars(iEnd)) AndAlso strRest.Chars(iEnd)<>">"c
            iEnd=iEnd+1
        Loop
    End If

    Dim strLink As String = strRest.Substring(iStart,iEnd-iStart).Trim()
    If strLink.Length=0 Then
        Return strAttr
    End If

    Return strName & strRest.Substring(0,iStart) & CStr(DoLinks(strUrl,strLink)) & strRest.Substring(iEnd)
End Function

' Converts a link found in a page into an absolute URL.
'
'   strUrl      - absolute URL of the page (http://host/dir/file)
'   strTempLink - link text as written in the page
'
' Returns:
'   - the link unchanged when it is already absolute (http://, https://,
'     ftp://, //host), starts with "www.", or is a javascript:, mailto: or
'     "#" link;
'   - host + link for links starting with "/";
'   - for links starting with "../", the page directory moved up one level
'     per leading "../" (never above the site root), followed by the rest;
'   - page URL + "/" + rest for links starting with "./";
'   - page directory + link otherwise.
Function DoLinks(byVal strUrl as string,byVal strTempLink as string)
    If strUrl Is Nothing Then
        strUrl=""
    End If
    If strTempLink Is Nothing Then
        strTempLink=""
    End If

    ' Split the page URL into "http://host" and the path that follows it.
    Dim objSiteMatch As Match = Regex.Match(strUrl,"http://[^/]+",RegexOptions.IgnoreCase)
    Dim strSite As String = ""
    Dim strUrlF As String = strUrl
    If objSiteMatch.Success Then
        strSite=objSiteMatch.Value.Trim()
        strUrlF=strUrl.Substring(objSiteMatch.Index+objSiteMatch.Length)
    End If

    ' A query string or fragment is not part of the directory.
    Dim iQuery As Integer = strUrlF.IndexOfAny(New Char() {"?"c,"#"c})
    If iQuery>=0 Then
        strUrlF=strUrlF.Substring(0,iQuery)
    End If

    ' Directory of the page, always ending in "/". Cutting at the last slash
    ' removes only the document name, even when the same text occurs earlier
    ' in the path.
    Dim iSlash As Integer = strUrlF.LastIndexOf("/"c)
    If iSlash>=0 Then
        strUrlF=strUrlF.Substring(0,iSlash+1)
    Else
        strUrlF="/"
    End If

    Dim strLinkF As String
    Dim strLower As String = strTempLink.ToLower()

    If strLower.StartsWith("javascript:") OrElse strLower.StartsWith("mailto:") OrElse strLower.StartsWith("#") OrElse _
       strLower.StartsWith("http://") OrElse strLower.StartsWith("https://") OrElse strLower.StartsWith("ftp://") OrElse _
       strLower.StartsWith("//") OrElse strLower.StartsWith("www.") Then
        strLinkF=strTempLink
    ElseIf strTempLink.StartsWith("../") Then
        ' Drop one directory per leading "../". When the page asks for more
        ' levels than exist, stay at the site root.
        Dim strEnd As String = strTempLink
        Do While strEnd.StartsWith("../")
            strEnd=strEnd.Substring(3)
            If strUrlF.Length>1 Then
                strUrlF=strUrlF.Substring(0,strUrlF.LastIndexOf("/"c,strUrlF.Length-2)+1)
            End If
        Loop
        strLinkF=strSite & strUrlF & strEnd.Trim()
    ElseIf strTempLink.StartsWith("./") Then
        ' Example: page http://www.southcn.com/news/china
        '          link ./todaycn/200306260529.htm
        ' NOTE(review): this branch resolves against the full page URL (as if
        ' it were a directory), unlike the plain relative branch below, which
        ' uses the page's directory. Kept as written; confirm which is intended.
        strLinkF=strUrl & "/" & strTempLink.Substring(2)
    ElseIf strTempLink.StartsWith("/") Then
        strLinkF=strSite & strTempLink
    Else
        strLinkF=strSite & strUrlF & strTempLink
    End If

    DoLinks=strLinkF
End Function
</script>
<html>
<body>
</body>