例如:在 www.abc.com/bbs/index.php 页面获取到链接 forum-38-1.html,
要改成 www.abc.com/bbs/forum-38-1.html;
在 www.abc.com/bbs/abc/ 页面获取到链接 ../forum-38-1.html,
要改成 www.abc.com/bbs/forum-38-1.html;
在 www.abc.com/bbs/abc/ 页面获取到链接 /index.html,
要改成 www.abc.com/index.html。
要改成 www.abc.com/bbs/forum-38-1.html
在 www.abc.com/bbs/abc/ 获取了 ../forum-38-1.html
要改成 www.abc.com/bbs/forum-38-1.html
在 www.abc.com/bbs/abc/ 获取了 /index.html
要改成 www.abc.com/index.html
要改成 www.abc.com/bbs/forum-38-1.html
{
    // Best-effort download of strUrl: returns the response body as text, or ""
    // when the request cannot be created or no response is obtained.
    // NOTE(review): the method signature is not visible in SOURCE; strUrl is
    // presumably its string parameter - confirm against the real declaration.
    string strHtml = "";
    WebResponse wrp;
    try
    {
        WebRequest wrq = WebRequest.Create(strUrl);
        wrq.Timeout = 60000; // WebRequest.Timeout is expressed in milliseconds
        wrp = wrq.GetResponse();
    }
    catch (Exception)
    {
        // Deliberate best effort: any failure while creating or sending the
        // request (including WebException) yields the empty string.
        return strHtml;
    }

    if (wrp == null)
    {
        return strHtml;
    }

    // The using statements release the reader and the response even when
    // reading the body throws; such read errors still propagate to the caller.
    // The body is decoded with StreamReader's default encoding.
    using (wrp)
    using (StreamReader sr = new StreamReader(wrp.GetResponseStream()))
    {
        strHtml = sr.ReadToEnd();
    }
    return strHtml;
}
/// <summary>
/// Collects the value of every quoted <c>href</c> attribute found in an HTML string.
/// </summary>
/// <param name="strHtml">HTML text to scan.</param>
/// <returns>
/// The captured href values in match order, or <c>null</c> when the text
/// contains no match.
/// </returns>
string[] GetLink(string strHtml)
{
    Regex hrefPattern = new Regex("(?<=href[\\s\\r]*=[\\s\\r]*[\"\'])([^\"\'\\s\\r>]*)(?<=[\"\'\\s\\r]{0,1})", RegexOptions.IgnoreCase);
    MatchCollection hits = hrefPattern.Matches(strHtml);

    // Callers receive null, not an empty array, when nothing matched.
    if (hits.Count == 0)
    {
        return null;
    }

    string[] links = new string[hits.Count];
    for (int idx = 0; idx < links.Length; idx++)
    {
        // Group 1 is the attribute value without the surrounding quotes.
        links[idx] = hits[idx].Groups[1].Value;
    }
    return links;
}
通过这2个函数获取的
www.abc.com/bbs/abc/../forum-38-1.html
与 www.abc.com/bbs/forum-38-1.html 的访问效果是一样的,不知道你要做什么用。
{
    // NOTE(review): the signature of this method is not visible in SOURCE. The
    // body reads strHtml and strUrl and returns string[], so it is presumably a
    // GetLink variant that also receives the page address - confirm.
    // It extracts every quoted href value from strHtml and converts each one to
    // an absolute URL with FullLink; it returns null when nothing matched.
    Regex reg = new Regex("(?<=href[\\s\\r]*=[\\s\\r]*[\"\'])([^\"\'\\s\\r>]*)(?<=[\"\'\\s\\r]{0,1})", RegexOptions.IgnoreCase);
    MatchCollection mc = reg.Matches(strHtml);
    if (mc.Count > 0)
    {
        string[] strHref = new string[mc.Count];
        int i = 0;
        foreach (Match m in mc)
        {
            strHref[i] = FullLink(m.Groups[1].Value, strUrl);
            ++i;
        }
        return strHref;
    }
    return null;
}

/// <summary>
/// Resolves a link taken from a page into an absolute URL, using the address
/// of that page as the base.
/// </summary>
/// <param name="link">
/// The href value as written in the page: relative ("a.html"), parent-relative
/// ("../a.html"), root-relative ("/a.html") or already absolute.
/// </param>
/// <param name="strUrl">
/// Address of the page the link came from. "http://" is assumed when it does
/// not start with "http://" or "https://".
/// </param>
/// <returns>
/// The absolute URL. <paramref name="link"/> is returned unchanged when it is
/// already an absolute http(s) URL, or when it cannot be resolved against
/// <paramref name="strUrl"/> (for example when either value is null or
/// malformed).
/// </returns>
private string FullLink(string link, string strUrl)
{
    // Absolute http(s) links need no resolution; hand them back untouched so
    // the caller sees exactly the text that was in the page.
    if (link != null &&
        (link.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
         link.StartsWith("https://", StringComparison.OrdinalIgnoreCase)))
    {
        return link;
    }

    if (string.IsNullOrEmpty(strUrl))
    {
        return link;
    }

    // Page addresses are commonly given without a scheme ("www.abc.com/bbs/").
    if (!strUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
        !strUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        strUrl = "http://" + strUrl;
    }

    // System.Uri performs standard reference resolution. That covers host-only
    // bases, any number of "./" and "../" segments, "//host/..." links,
    // fragments, and slashes that appear inside a query string - all cases the
    // previous manual IndexOf/Substring arithmetic got wrong.
    Uri baseUri;
    Uri resolved;
    if (Uri.TryCreate(strUrl, UriKind.Absolute, out baseUri) &&
        Uri.TryCreate(baseUri, link, out resolved))
    {
        return resolved.AbsoluteUri;
    }

    return link;
}