// Download the shop page and pull out the text of its <a class="hCard fn" ...> anchor.
string s = readUrlHTML(@"http://shop36459303.taobao.com");
// The lookbehind uses [^>]* so it can only end at the anchor's own '>', and the body is
// lazy (.*?) so it stops at the first </a>. With greedy ".*" in both places a line that
// holds several tags yields one over-long match running to the last </a> on that line.
Regex r = new Regex("(?<=<a class=\"hCard fn\" [^>]*>).*?(?=</a>)");
MatchCollection math = r.Matches(s);
foreach (Match m in math)
{
    string result = m.ToString(); // expected for this shop: "tv在线购物专柜"
}
// Same extraction as above, run against a literal anchor so it can be tried offline.
string s = "<a class=\"hCard fn\" href=\"http://alanfurniture.taobao.com\">艾伦家具</a>";
// [^>]* keeps the lookbehind inside the opening tag; .*? stops at the first </a>.
// The greedy form would swallow any further markup that follows on the same line.
Regex r = new Regex("(?<=<a class=\"hCard fn\" [^>]*>).*?(?=</a>)");
MatchCollection math = r.Matches(s);
foreach (Match m in math)
{
    string result = m.ToString(); // "艾伦家具"
}
// Extract the page title with plain string searching instead of a regex.
string s = readUrlHTML(@"http://shop36459303.taobao.com");
const string titleOpen = "<title>";
// readUrlHTML returns "" when the download fails, so the tags may be absent; guard the
// -1 results instead of letting Substring throw ArgumentOutOfRangeException.
// Ordinal comparison: these are markup tokens, not culture-sensitive text.
int titleStart = s.IndexOf(titleOpen, System.StringComparison.Ordinal);
int titleEnd = titleStart < 0
    ? -1
    : s.IndexOf("</title>", titleStart + titleOpen.Length, System.StringComparison.Ordinal);
string result = titleEnd < 0
    ? ""
    : s.Substring(titleStart + titleOpen.Length, titleEnd - titleStart - titleOpen.Length)
        .Replace("\t", "").Replace("\r", "").Replace("\n", "");
// Example result for this shop: "首页-桑德大卖场-淘宝网"
// (The remaining trimming of the title is left out here; it follows the same approach.)

/// <summary>
/// Downloads the document at <paramref name="url"/> with an HTTP GET request and
/// returns the response body decoded as text.
/// </summary>
/// <param name="url">Absolute URL of the page to download.</param>
/// <returns>
/// The decoded response body, or an empty string when the request, the download or
/// the decoding fails for any reason.
/// </returns>
private System.String readUrlHTML(System.String url)
{
    try
    {
        System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
        // HTTP method tokens are case-sensitive; use the canonical upper-case form.
        request.Method = "GET";

        // 'using' releases the connection and the stream even when reading throws;
        // previously response.Close() was skipped on every failure path.
        using (System.Net.HttpWebResponse response = (System.Net.HttpWebResponse)request.GetResponse())
        {
            System.Text.Encoding encode;
            try
            {
                encode = System.Text.Encoding.GetEncoding(response.CharacterSet);
            }
            catch (System.ArgumentException)
            {
                // Missing or unknown charset name: fall back to the platform default encoding.
                encode = System.Text.Encoding.Default;
            }

            using (System.IO.Stream stream = response.GetResponseStream())
            using (System.IO.StreamReader read = new System.IO.StreamReader(stream, encode))
            {
                return read.ReadToEnd();
            }
        }
    }
    catch (System.Exception)
    {
        // Deliberate best-effort contract: callers receive "" instead of an exception.
        return "";
    }
}
{
byte[] buf = new WebClient().DownloadData(url);
if (encoding != null) return encoding.GetString(buf);
string html = Encoding.UTF8.GetString(buf);
encoding = GetEncoding(html);
if (encoding == null || encoding == Encoding.UTF8) return html;
return encoding.GetString(buf);
}
/// <summary>
/// Returns the text between the first &lt;title ...&gt; and &lt;/title&gt; pair in
/// <paramref name="html"/>, with leading and trailing whitespace removed.
/// </summary>
/// <param name="html">HTML markup to search.</param>
/// <returns>The trimmed title text, or an empty string when no title element matches.</returns>
static string GetTitle(string html)
{
    // (?si): '.' also matches newlines and matching ignores case. The opening tag may
    // carry attributes, whose quoted values are allowed to contain '>'.
    const string titlePattern = @"(?si)<title(?:\s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>(?<title>.*?)</title>";

    Match found = Regex.Match(html, titlePattern);
    Group captured = found.Groups["title"];
    string rawTitle = captured.Value; // empty when nothing matched
    return rawTitle.Trim();
}
// Write the captured link target and link text of every match to the HTTP response.
foreach (Match m in mc)
{
    // The group names previously carried a trailing space ("url ", "text "). A .NET
    // group name cannot contain whitespace, so those lookups always produced an empty
    // value. NOTE(review): assumes the pattern behind mc defines groups named "url"
    // and "text" - confirm against the Regex that produced mc.
    Response.Write(m.Groups["url"].Value);
    Response.Write(m.Groups["text"].Value);
}
// Collect the inner text of every <a class="hCard fn" ...> anchor found in s.
// [^>]* confines the lookbehind to the opening tag and .*? stops at the first </a>;
// the greedy ".*" form ran on to the last </a> of the line when several tags shared it.
Regex r = new Regex("(?<=<a class=\"hCard fn\" [^>]*>).*?(?=</a>)");
MatchCollection math = r.Matches(s);
foreach (Match m in math)
{
    string result = m.ToString(); // e.g. "tv在线购物专柜"
}
// Collect the inner text of every <a class="hCard fn" ...> anchor found in s.
// The lookbehind is limited to the opening tag ([^>]*) and the body is lazy (.*?),
// so each match ends at the nearest </a> rather than the last one on the line.
Regex r = new Regex("(?<=<a class=\"hCard fn\" [^>]*>).*?(?=</a>)");
MatchCollection math = r.Matches(s);
foreach (Match m in math)
{
    string result = m.ToString(); // e.g. "艾伦家具"
}
情况是:一个页面只有一个 <a class="hCard fn" href="http://alanfurniture.taobao.com">艾伦家具</a>。我在文本框输入 http://alanfurniture.taobao.com,下面的文本框会自动提取"艾伦家具"这 4 个字。
不论我输入什么地址,所有的页面都有一个共同的 <a class="hCard fn",后面的 href 不一定就是文本框输入的地址,怎么用正则表达式查询啊?