// Reference notes (source article: http://www.knowsky.com/4234.html):
// The regex used in Start() matches <a|img|script|link ... href/src="...">
// tags in the fetched page: group <url> captures the link target, <name> the
// link text. 用正则式匹配网页内的 <url> URL 地址和 <name> 链接文本;
// 用于把下载页面引用的每一个文档保存到本地.
#region 得到网络流
/// <summary>
/// 返回获取的网络流
/// </summary>
/// <param name="url"></param>
/// <returns></returns>
public static string getPage(String url)
{
WebResponse result = null;
System.Text.StringBuilder txthtml=new StringBuilder();
try
{
WebRequest req = WebRequest.Create(url);
result = req.GetResponse();
Stream ReceiveStream = result.GetResponseStream();
Encoding encode = System.Text.Encoding.GetEncoding("gb2312");
StreamReader sr = new StreamReader(ReceiveStream,encode);
//this.Message.Text+="\r\n已接收到响应流";
if (true)
{
Char[] read = new Char[256];
int count = sr.Read( read, 0, 256 ); // this.Message.Text+="HTML...\r\n";
while (count > 0)
{
String str = new String(read, 0, count);
txthtml.Append(str);
count = sr.Read(read, 0, 256);
}
// this.Message.Text="":
}
}
catch(Exception)
{
txthtml.Append("err");
//this.is_id.Value=url;
}
finally
{
if (result != null)
{
result.Close();
}
}
//this.is_id.Value=url;
return txthtml.ToString();
} #endregion #region 获取根路径
public static string GetUrlRootPath(string Url)
{
Url=Url.Replace("http://","");
//ArrayList pathParts = new ArrayList();
string[] p = Url.Split('/');
return p[0];
} #endregion #region 处理带..的URL路径 public static string GetAndNormailizePath(string path)
{
string url=path.Replace("http://","");
string m_CurrentDir="";
ArrayList pathParts = new ArrayList();
string[] p = url.Split('/');
pathParts.AddRange(p);
for(int i=0;i<pathParts.Count;i++)
{
if(pathParts[i].ToString() == "..")
{
if(i > 0)
{
pathParts.RemoveAt(i - 1);
i--;
} pathParts.RemoveAt(i);
i--;
}
} for(int i=0;i<pathParts.Count ;i++)
{
m_CurrentDir += pathParts[i] + "/";
}
return String.Format("http://{0}",m_CurrentDir.Remove(m_CurrentDir.Length-1,1)) ;//"http://"+m_CurrentDir.Remove(m_CurrentDir.Length-1,1);
}
#endregion
void Start()
{
strFileList.AppendText("主线程启动.......\n");
string strURL=this.UrlAddress.Text.Trim();
string PageStr=Core.getPage(strURL);
Clist UrlList =new Clist();
//StringBuilder sb = new StringBuilder(); .*)-(
Regex re = new Regex(@"<[a|img|script|link]\s+[href|src]+=([""'])(?<url>[^'"">]+)\1\s*>(\s*(?<name>[^-\s|(<img\S\s</)]*)\s*-)?\s*(?<song>[^<]+)</", RegexOptions.IgnoreCase | RegexOptions.Singleline);
MatchCollection mc = re.Matches(PageStr);
foreach (Match m in mc)
{
if (m.Groups["name"].Value != "")
this.strFileList.AppendText(m.Groups["name"].Value);
else if (m.Groups["name"].Captures.Count > 0)
this.strFileList.AppendText("未知");
else
strFileList.AppendText("文件名");
strFileList.AppendText("\t");
strFileList.AppendText(m.Groups["song"].Value);
strFileList.AppendText("\t");
strFileList.AppendText(m.Groups["url"].Value);
UrlList.Append(m.Groups["url"].Value);
strFileList.AppendText("\n");
}
strFileList.AppendText("遍历URL树:\n");
UrlList.MoveFrist();
string url=string.Empty;
while(!UrlList.IsEof())
{
url=UrlList.GetCurrentValue().ToString();
//strFileList.AppendText(UrlList.GetCurrentValue()+"\n");
//GetDirPath(url);
//strFileList.AppendText();
if(url.IndexOf("/")==-1)
{
url=this.UrlAddress.Text+"/"+url;
}
if(url.StartsWith("/"))
{
url="http://"+Core.GetUrlRootPath(this.UrlAddress.Text)+"/"+url;
}
if(url.StartsWith("."))
{
url=Core.GetAndNormailizePath(url);
}
this.Down(url,GetDirPath(url));
UrlList.MoveNext();
}
System.Windows.Forms.MessageBox.Show("文件下载完成!");
}
public void Down(string StrUrl,string FilePath)
{
long lStartPos =0; int iNumber = StrUrl.LastIndexOf("/");
string strFileName = StrUrl.Substring(iNumber+1,StrUrl.Length - (iNumber + 1));
string StrFileName=FilePath+"\\"+strFileName;
if(strFileName=="")
{
System.IO.Directory.CreateDirectory(StrFileName);
return;
}
System.IO.FileStream fs;
StrFileName=StrFileName.Replace("?","问号");
//打开上次下载的文件或新建文件
if (System.IO.File.Exists(StrFileName)) { fs= System.IO.File.OpenWrite(StrFileName); lStartPos=fs.Length; fs.Seek(lStartPos,System.IO.SeekOrigin.Current); //移动文件流中的当前指针 } else { fs = new System.IO.FileStream(StrFileName,System.IO.FileMode.Create); lStartPos =0; } //打开网络连接 try { System.Net.HttpWebRequest request =(System.Net.HttpWebRequest)System.Net.HttpWebRequest.Create(StrUrl); if ( lStartPos>0) request.AddRange((int)lStartPos); //设置Range值 //向服务器请求,获得服务器回应数据流 System.IO.Stream ns= request.GetResponse().GetResponseStream(); byte[] nbytes = new byte[512]; int nReadSize=0; nReadSize=ns.Read(nbytes,0,512); while( nReadSize >0) { fs.Write(nbytes,0,nReadSize); nReadSize=ns.Read(nbytes,0,512); } fs.Close(); ns.Close(); this.strFileList.AppendText("下载"+StrUrl+"完成!\n"); } catch(Exception ex) { fs.Close(); this.strFileList.AppendText("下载过程中出现错误:"+ex.ToString()); }
} public string GetDirPath(string URL)
{
string RootPath=this.FilePathStr.Text;
int c=0;
if(URL.ToLower().StartsWith("http://"))
{
c=URL.Replace("http://","").IndexOf("/")+7;
}
if(URL.ToLower().StartsWith("ftp://"))
{
c=URL.Replace("ftp://","").IndexOf("/")+6;
}
string path =RootPath+URL.Substring(c,URL.Length-c-(URL.Length-URL.LastIndexOf("/"))).Replace("/","\\");
if(!System.IO.Directory.Exists(path))
{
System.IO.Directory.CreateDirectory(path);
this.strFileList.AppendText("创建目录:"+path+"完成!\n");
}
return path;
}
#endregion
// —— 孟子de 的回帖说明 ——
// 只不过这个是把下载页面引用的每一个文档保存下来。
// 可能跟你说的没关系, 抱歉! 先谢谢了。
// 正则表达式匹配规则说明:
//   @"<(a|img|script|link) ..."  匹配 a、img、script、link 标签;
//   (?<url>[^'"">]+)\1\s*>       <url> 组匹配 URL 地址, 其后的组匹配链接文本;
//   RegexOptions.IgnoreCase      指定不区分大小写;
//   RegexOptions.Singleline      指定单行匹配模式;
//   MatchCollection mc = re.Matches(PageStr);  把匹配到的内容放入集合;
//   foreach (Match m in mc)      逐个取出集合中的匹配项。
if request("submit")<>"" or request("domain")<>"" then
url="http://www.alexa.com/data/details/traffic_details?q=&url="&replace(request("domain"),"http://","")
getsms=gethttppage(url)
if instr(getsms,">Traffic Rank for") then
star=instr(getsms,">Traffic Rank for")+206
endd=instr(star,getsms,"</table>")+8
getstr=mid(getsms,star,endd-star)
getstr=replace(getstr,">Today<",">今日排名<")
getstr=replace(getstr,"1 wk. Avg.","一周平均")
getstr=replace(getstr,"3 mos. Avg.","三月平均")
getstr=replace(getstr,"3 mos. Change","三月浮动")
getstr=replace(getstr,"#CCCC99","#efefef")
getstr=replace(getstr,"<span class=""body"">","<font size=""-1"">")
response.write getstr
picurl="http://traffic.alexa.com/graph?w=200&h=100&r=6m&u="&request("domain")&"/&u="
response.write "<br> <a href =""http://www.alexa.com/data/details/?url="&request("domain")&""" target=""_blank"">点击察看alexa该网址说明</a><td width=""50%""><img src="&picurl&">"
end if
else response.write "您可能输入错误,请从<a href =""http://www.tmxk.net/info"">www.tmxk.net/info</a>重新搜索<br>"
end if
%>
' 我的目的和这个 ASP 代码差不多: 在抓回的代码中定位到某一部分, 取得需要的字符串。