<div id="stylebags_pro_model">Model: M45715</div><!--bof Product Price block --><h2 id="productPrices" class="productGeneral">$219.00</h2><!--eof Product Price block --><!--bof free ship icon -->
<!--eof free ship icon --><br class="clearBoth" /><div id="needHelp">
用正则获取其中的
Model: M45715 和 $219.00。等牛人来了,和前一帖一起结帖。
<!--eof free ship icon --><br class="clearBoth" /><div id="needHelp">
用正则获取其中的
Model: M45715 和 $219.00。等牛人来了,和前一帖一起结帖。
比如提取 Model: M45715 根据什么规则?
是<div id="stylebags_pro_model"> 至</div>之间的数据还是其他ID? ID是否有限制?
string str="我上面的代码";
从这个str中 取出Model: M45715 和 $219.00这两个值
要求是正则
// Read the saved page as GB2312 text. The using block closes the file handle
// even when ReadToEnd throws.
string tempStr;
using (StreamReader sr = new StreamReader(@"C:\Documents and Settings\Administrator\桌面\Test.txt", Encoding.GetEncoding("GB2312")))
{
    tempStr = sr.ReadToEnd();
}
string pattern = @"[\s\S]*<div[^>]+stylebags_pro_model[^>]*>([^<]+)</div>[\s\S]+<h2[^>]+productPrices[^>]*>([^<]+)</h2>[\s\S]*";
// Run the pattern once and read both capture groups from the same match.
// When the markup is missing both values are empty strings (Regex.Replace
// would instead hand back the whole file unchanged).
Match productMatch = Regex.Match(tempStr, pattern);
string tempStr1 = productMatch.Groups[1].Value; // Output: Model: M45715
string tempStr2 = productMatch.Groups[2].Value; // Output: $219.00
// Demo: pulls the text of a <div> and of the <h2> that follows it out of a
// product-page fragment and prints the two values separated by a tab.
void Main()
{
    string html = @"<div id=""stylebags_pro_model"">Model: M45715</div><!--bof Product Price block --><h2 id=""productPrices"" class=""productGeneral"">$219.00</h2><!--eof Product Price block --><!--bof free ship icon -->
<!--eof free ship icon --><br class=""clearBoth"" /><div id=""needHelp"">
";
    // Group 1 = text inside the <div>, group 2 = text inside the <h2>.
    MatchCollection hits = Regex.Matches(html, @"(?is)<div[^>]*?>([^<]+)</div>.*?<h2[^>]*?>([^<]+)</h2>");
    for (int i = 0; i < hits.Count; i++)
    {
        string model = hits[i].Groups[1].Value.Trim();
        string price = hits[i].Groups[2].Value.Trim();
        Console.WriteLine("{0}\t{1}", model, price);
    }
    // Prints:
    // Model: M45715	$219.00
}
stylebags_pro_model">Model: M45715</div><!--bof Product Price block --><h2 id="productPrices" class="productGeneral">$219.00</h2><!--eof Product Price block --><!--bof free ship icon -->
<!--eof free ship icon --><br class="clearBoth" /><div id="
现在变成这样了
using System;
using System.Configuration;
using System.Data;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using MSXML2;

/// <summary>
/// Helpers for downloading a web page and cutting fragments out of its HTML
/// with regular expressions.
/// </summary>
public class so
{
    /// <summary>
    /// Text appended after every fragment returned by <see cref="GetSubHtmls"/>
    /// and <see cref="GetSubHtmlsWithOutTag"/>.
    /// NOTE(review): this literal looks like an e-mail-obfuscation artifact of
    /// the page the code was copied from; the separator the author intended is
    /// unknown - confirm before relying on it.
    /// </summary>
    private const string FragmentSeparator = "[email protected]";

    // Finds the charset name in a <meta> tag. Handles both
    // content="text/html; charset=gb2312" and the HTML5 form <meta charset="utf-8">.
    private static readonly Regex MetaCharSetRegex =
        new Regex("<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([\\w.:+-]+)", RegexOptions.IgnoreCase);

    /// <summary>Creates the helper; there is no state to initialise.</summary>
    public so()
    {
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and returns the page as text.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="charSet">
    /// Encoding name of the page. When null or empty, the name is taken from
    /// the page's own &lt;meta ... charset=...&gt; tag if one is present.
    /// </param>
    /// <returns>
    /// The page decoded with the requested or detected encoding, or with
    /// <see cref="Encoding.Default"/> when neither is available.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// The caller supplied a <paramref name="charSet"/> that is not a known encoding name.
    /// </exception>
    public string getHtml(string url, string charSet)
    {
        // Some pages cannot be fetched this way (they may need a cookie or a
        // specific User-agent header); add the header to the client before the
        // download in that case. Servers that ask for a user name and password
        // need a NetworkCredential instead of the default credentials.
        byte[] myDataBuffer;
        using (WebClient myWebClient = new WebClient())
        {
            myWebClient.Credentials = CredentialCache.DefaultCredentials;
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass with the system default encoding; enough to read the
        // ASCII-only <meta> tag.
        string strWebData = Encoding.Default.GetString(myDataBuffer);

        bool detected = false;
        if (string.IsNullOrEmpty(charSet))
        {
            charSet = DetectCharSet(strWebData);
            detected = true;
        }

        if (string.IsNullOrEmpty(charSet))
        {
            return strWebData;
        }

        Encoding pageEncoding;
        try
        {
            pageEncoding = Encoding.GetEncoding(charSet);
        }
        catch (ArgumentException)
        {
            if (!detected)
            {
                throw; // an explicit, wrong charSet is the caller's error
            }
            // The page advertised a name this runtime does not know; keep the
            // default-encoded text instead of failing the whole download.
            return strWebData;
        }

        if (!pageEncoding.Equals(Encoding.Default))
        {
            strWebData = pageEncoding.GetString(myDataBuffer);
        }
        return strWebData;
    }

    /// <summary>
    /// Returns every fragment found between <paramref name="s"/> and
    /// <paramref name="e"/>, trimmed, each followed by the separator literal.
    /// </summary>
    /// <param name="str">Text to search.</param>
    /// <param name="s">Start delimiter; inserted into the pattern as a regular-expression fragment.</param>
    /// <param name="e">End delimiter; inserted into the pattern as a regular-expression fragment.</param>
    /// <returns>The concatenated fragments, or an empty string when nothing matches.</returns>
    public string GetSubHtmls(string str, string s, string e)
    {
        return CollectFragments(str, s, e, true);
    }

    /// <summary>
    /// Same as <see cref="GetSubHtmls"/> but the fragments are not trimmed and
    /// the combined result is passed through <see cref="GetText"/>.
    /// </summary>
    /// <param name="str">Text to search.</param>
    /// <param name="s">Start delimiter; inserted into the pattern as a regular-expression fragment.</param>
    /// <param name="e">End delimiter; inserted into the pattern as a regular-expression fragment.</param>
    /// <returns>The concatenated, tag-free fragments.</returns>
    public string GetSubHtmlsWithOutTag(string str, string s, string e)
    {
        return this.GetText(CollectFragments(str, s, e, false));
    }

    /// <summary>
    /// Returns the first fragment between <paramref name="s"/> and
    /// <paramref name="e"/> after passing it through <see cref="GetText"/>.
    /// </summary>
    /// <param name="str">Text to search.</param>
    /// <param name="s">Start delimiter; inserted into the pattern as a regular-expression fragment.</param>
    /// <param name="e">End delimiter; inserted into the pattern as a regular-expression fragment.</param>
    /// <returns>The tag-free fragment, or an empty string when nothing matches.</returns>
    public string GetSubHtmlwithOutTag(string str, string s, string e)
    {
        return GetText(GetSubHtml(str, s, e));
    }

    /// <summary>
    /// Returns the first fragment located between <paramref name="s"/> and
    /// <paramref name="e"/>; the delimiters themselves are not included.
    /// </summary>
    /// <param name="str">Text to search.</param>
    /// <param name="s">Start delimiter; inserted into the pattern as a regular-expression fragment.</param>
    /// <param name="e">End delimiter; inserted into the pattern as a regular-expression fragment.</param>
    /// <returns>The fragment, or an empty string when nothing matches.</returns>
    public string GetSubHtml(string str, string s, string e)
    {
        // Look-behind / look-ahead keep the delimiters out of the match value.
        Regex rg = new Regex("(?<=(" + s + "))[.\\s\\S]*?(?=(" + e + "))", RegexOptions.Multiline | RegexOptions.Singleline);
        return rg.Match(str).Value;
    }

    /// <summary>
    /// Strips script blocks, style blocks and all remaining tags from
    /// <paramref name="strTemp"/>.
    /// </summary>
    /// <param name="strTemp">HTML text.</param>
    /// <returns>The text with scripts, styles and tags removed.</returns>
    public String GetText(String strTemp)
    {
        // Tag names are matched case-insensitively so <SCRIPT> and <STYLE>
        // blocks are removed together with their content.
        string withoutScripts = Regex.Replace(strTemp, "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>", "", RegexOptions.IgnoreCase);
        string withoutStyles = Regex.Replace(withoutScripts, "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>", "", RegexOptions.IgnoreCase);
        return Regex.Replace(withoutStyles, "<[^>]+>", "");
    }

    /// <summary>
    /// Fetches a page through the MSXML2 XMLHTTP object and decodes the
    /// response body as gb2312.
    /// </summary>
    /// <param name="priStrUrl">Address to request.</param>
    /// <param name="priStrMothed">HTTP method passed to XMLHTTP.open.</param>
    /// <returns>The page source, or the literal "error" when anything fails.</returns>
    public string funGetBody(string priStrUrl, string priStrMothed)
    {
        try
        {
            MSXML2.XMLHTTP Retrieval = new MSXML2.XMLHTTP();
            Retrieval.open(priStrMothed, priStrUrl, false, "", "");
            Retrieval.send(null);
            byte[] b = (byte[])Retrieval.responseBody;
            return Encoding.GetEncoding("gb2312").GetString(b, 0, b.Length);
        }
        catch (Exception)
        {
            // Failures are reported through the return value, so the exception
            // is deliberately not rethrown.
            return "error";
        }
    }

    // Reads the charset name advertised in the page's <meta> tag; returns an
    // empty string when there is none.
    private static string DetectCharSet(string html)
    {
        return MetaCharSetRegex.Match(html).Groups[1].Value;
    }

    // Concatenates every "s ... e" fragment of str, each followed by the
    // separator literal; trim controls whether fragments are trimmed first.
    private static string CollectFragments(string str, string s, string e, bool trim)
    {
        Regex reg1 = new Regex("" + s + "(.*?)" + e + "", RegexOptions.Singleline | RegexOptions.IgnoreCase);
        StringBuilder resString = new StringBuilder();
        foreach (Match match in reg1.Matches(str))
        {
            string fragment = match.Groups[1].Value;
            resString.Append(trim ? fragment.Trim() : fragment).Append(FragmentSeparator);
        }
        return resString.ToString();
    }
}
// Forum remark that trailed the class in the original post: "就这样吧" ("let's leave it at that").
不要Model 嘿嘿
// Read the saved page as GB2312 text. The using block closes the file handle
// even when ReadToEnd throws.
string tempStr;
using (StreamReader sr = new StreamReader(@"C:\Documents and Settings\Administrator\桌面\Test.txt", Encoding.GetEncoding("GB2312")))
{
    tempStr = sr.ReadToEnd();
}
// Group 1 skips everything up to and including the first ':' inside the model
// element, so the "Model" label is left out.
string pattern = @"[\s\S]*stylebags_pro_model[^>]*>[^:]+:([^<]+)</div>[\s\S]+<h2[^>]+productPrices[^>]*>([^<]+)</h2>[\s\S]*";
// Run the pattern once and read both capture groups from the same match.
// When the markup is missing both values are empty strings (Regex.Replace
// would instead hand back the whole file unchanged).
Match productMatch = Regex.Match(tempStr, pattern);
string tempStr1 = productMatch.Groups[1].Value; // Output:  M45715
string tempStr2 = productMatch.Groups[2].Value; // Output: $219.00
// Demo: prints every piece of text that sits between two tags of the sample
// fragment; for "label: value" text only the value is printed.
void Main()
{
    string str = @"<div id=""stylebags_pro_model"">Model: M45715</div><!--bof Product Price block --><h2 id=""productPrices"" class=""productGeneral"">$219.00</h2><!--eof Product Price block --><!--bof free ship icon -->
<!--eof free ship icon --><br class=""clearBoth"" /><div id=""needHelp"">
";
    foreach (Match m in Regex.Matches(str, @"(?<=>)[^<>]+(?=<)"))
    {
        string text = m.Value.Trim();
        if (text.Length == 0)
        {
            // The line break between two comment tags also matches the
            // pattern; it carries no value, so do not print a blank line.
            continue;
        }

        // Keep everything after the first ':' and trim it, so the value has no
        // leading space and a value that itself contains ':' is not cut short.
        int colon = text.IndexOf(':');
        Console.WriteLine(colon >= 0 ? text.Substring(colon + 1).Trim() : text);
    }
    /*
    Output:
    M45715
    $219.00
    */
}
和q107770540
的帮忙,我把你们的方法综合了一下,问题解决了。
结帖。