我正在做一个投标管理系统。
比如,下面是一个招标网站:
http://www.szft.gov.cn/zfbm/zfcg/zhbgg/index.jsp
点击列表中的条目进去是详细内容。现在我要抓取详细内容里有用的招标信息,应该如何抓取?
请各位网友提供一个思路,谢谢。
比如,下面是一个招标网站:
http://www.szft.gov.cn/zfbm/zfcg/zhbgg/index.jsp
点击列表中的条目进去是详细内容。现在我要抓取详细内容里有用的招标信息,应该如何抓取?
请各位网友提供一个思路,谢谢。
{
Regex reg = new Regex( strReg, RegexOptions.IgnoreCase);
Match m = reg.Match( str );
return m.Groups[strProperty].Value;
/*
MatchCollection mc = reg.Matches(str);
foreach(Match m in mc )
{
string name = m.Groups[strProperty].Value;
}
return string.Empty;
*/
}

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The response decoded with <c>System.Text.Encoding.Default</c>, or
/// <c>string.Empty</c> when opening or reading the resource throws.
/// </returns>
private string GrabUrl( string url )
{
    try
    {
        // Stacked using statements dispose the reader, the response stream and the
        // client on every exit path; the previous hand-written finally block closed
        // the stream and client but never disposed the reader.
        using( WebClient client = new WebClient() )
        using( Stream responseStream = client.OpenRead( url ) )
        // NOTE(review): Encoding.Default is the machine's ANSI code page, so decoding
        // depends on where this runs - confirm it matches the target site's charset.
        using( StreamReader reader = new StreamReader( responseStream, System.Text.Encoding.Default ) )
        {
            return reader.ReadToEnd();
        }
    }
    catch
    {
        // Deliberate best-effort: callers treat an empty string as "nothing to parse".
        return string.Empty;
    }
}

// URL template for a category page; {0} is filled with the category id.
const string categoryUrl = "http://www.lijiabaobei.com/category.asp?cid={0}";
// URL template for the first listing page of a subcategory; {0} is the subcategory id.
const string subcategoryUrl = "http://www.lijiabaobei.com/scategory.asp?cid={0}";
// URL template for a specific listing page of a subcategory; {0} is the page number,
// {1} is the subcategory id.
const string subcategoryPageUrl = "http://www.lijiabaobei.com/scategory.asp?page={0}&cid={1}";
// Three constants share the next line:
//  - productUrl: URL template for a product detail page; {0} is the product id.
//  - subcategoryReg: regex for one subcategory link cell on a category page; named
//    groups "subcategoryid" and "name".
//  - subcategoryPage: regex for the pager text; named group "pagecount". The current
//    page number is hard-coded as 1 in the pattern, so it only matches the pager as
//    rendered on a first page.
const string productUrl = "http://www.lijiabaobei.com/product.asp?cid={0}"; const string subcategoryReg = "<td valign=\"bottom\" bgcolor=\"E7E7E7\" style=\"padding-top:4px;padding-left:8px\">·<a href=\"scategory.asp\\?cid=(?<subcategoryid>.*?)\">(?<name>.*?)</a></td>"; const string subcategoryPage = "第<strong><font color=red>1</font>/(?<pagecount>.*?)</strong>页";
// Regex for one product link cell on a subcategory listing page; named groups "id"
// and "name".
const string productLink = "<td width=\"293\" valign=\"bottom\" class=\"large\" style=\"padding-top:12px\"><a class=\"nav_sp_title_l\" href=\"product.asp\\?cid=(?<id>.*?)\">(?<name>.*?)</a></td>";
/// <summary>
/// Click handler that walks every stored subcategory, downloads each of its listing
/// pages and imports the products linked from them.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Button1_Click(object sender, System.EventArgs e)
{
    /*
    Disabled step, kept from the original: reads every category, downloads its page
    and stores each subcategory link found on it.

    using (SqlDataReader dr = lsy.GetAllCategory())
    {
        while(dr.Read())
        {
            Regex reg = new Regex( subcategoryReg, RegexOptions.IgnoreCase );
            string html = GrabUrl( string.Format( categoryUrl, dr["categoryid"] ) );
            MatchCollection mc = reg.Matches( html );

            for( int i = 0; i < mc.Count; i++ )
            {
                Match m = mc[i];

                lsy.AddSubCategory( Int32.Parse(dr["categoryid"].ToString()),
                    Int32.Parse(m.Groups["subcategoryid"].Value.Trim()),
                    m.Groups["name"].Value.Trim()
                    );
            }
        }
    }
    */

    // The pager pattern is constant, so build the regex once rather than per row.
    Regex regPage = new Regex( subcategoryPage, RegexOptions.IgnoreCase );

    using( SqlDataReader dr = lsy.GetAllSubCategory() )
    {
        while( dr.Read() )
        {
            int subcategoryid = Int32.Parse( dr["subcategoryid"].ToString() );

            // The unpaged URL returns the first listing page, which also carries the pager.
            string html = GrabUrl( string.Format( subcategoryUrl, subcategoryid ) );

            Match mp = regPage.Match( html );
            int pageCount = Convertor.ToInt32( mp.Groups["pagecount"].Value.Trim() );

            // Page 1 is already in hand. Previously it was fetched only to read the
            // page count and its products were never imported.
            ParseOnePage( html, subcategoryid );

            // Remaining pages, 2..pageCount inclusive.
            for( int page = 2; page <= pageCount; page++ )
            {
                ParseOnePage( GrabUrl( string.Format( subcategoryPageUrl, page, subcategoryid ) ), subcategoryid );
            }
        }
    }
}

/// <summary>
/// Finds every product link in one listing page and imports the linked product.
/// </summary>
/// <param name="html">Markup of a subcategory listing page.</param>
/// <param name="subcategoryid">Id of the subcategory the page belongs to.</param>
private void ParseOnePage(string html, int subcategoryid )
{
    // Collect the product entries on this page.
    Regex reg = new Regex( productLink, RegexOptions.IgnoreCase );

    foreach( Match m in reg.Matches( html ) )
    {
        int id = Int32.Parse( m.Groups["id"].Value.Trim() );
        AddProduct( id, subcategoryid );
    }
}

// Regex for the producing-area line of a product page; named group "producingarea".
const string regProducingArea = "<br>\r\n ·产地/(?<producingarea>.*?) <br></td>\r\n";
// Regex for the applicable-age label on a product page; named group "age".
const string regAge = "<span class=\"style11\">适用月(年)龄:</span><font color class=\"color_sh_title\">(?<age>.*?)</font>";
// Regex for the introduction text that follows the star marker; named group "introduction".
const string regIntroduction = "<font color class=\"color_sh_text\">★(?<introduction>.*?)</font></td>\r\n";
// Regex for the padded table cell captured as named group "re"; the pattern pins the
// exact line breaks and tabs that follow the cell.
const string regRe = "<td style=\"padding:10px 5px 10px 13px\">(?<re>.*?)\r\n</td>\r\n\t\t\t\t\t\t</tr>\r\n\t </table>";
// Regex for one table row of five cells; named groups "number", "name", "spec",
// "etprice" and "vipprice" (both price cells are prefixed with the yen/yuan sign).
// NOTE(review): not referenced in the visible code - presumably consumed by
// GetRowsProperty; confirm.
const string regRow = "<td>(?<number>.*?)</td>\r\n\t\t\t\t\t\t\t\t<td>(?<name>.*?)</td>\r\n\t\t\t\t\t\t\t\t<td>(?<spec>.*?)</td>\r\n\t\t\t\t\t\t\t\t<td>¥(?<etprice>.*?)</td>\r\n\t\t\t\t\t\t\t\t<td><font color class=\"color1\">¥(?<vipprice>.*?)</font></td>";
// Regex for the 200x200 product image tag; named group "image" holds the src value.
const string regImage = "<img src=\"(?<image>.*?)\" width=\"200\" height=\"200\" border=\"0\" class=\"box_juti\"></a><br>\r\n";
// Regex for the specification line of a product page; named group "spec".
const string regSpec = "<td colspan=\"2\" valign=\"top\" style=\"padding-top:10px;padding-bottom:20px\">\r\n ·规格/(?<spec>.*?)\r\n";

/// <summary>
/// Downloads one product page, extracts its fields and hands them to
/// <c>lsy.AddProduct</c>.
/// </summary>
/// <param name="id">Product id, substituted into <c>productUrl</c>.</param>
/// <param name="subcategoryid">Id of the subcategory the product was listed under.</param>
private void AddProduct( int id, int subcategoryid )
{
    string pageHtml = GrabUrl( string.Format( productUrl, id ) );

    // Values delivered through GetRowsProperty's out parameters.
    string itemNumber;
    string itemName;
    string rowSpec;     // received but not stored; the page-level spec below is used instead
    decimal etPrice;
    decimal vipPrice;
    GetRowsProperty( pageHtml, out itemNumber, out itemName, out rowSpec, out etPrice, out vipPrice );

    // Single-value fields, each pulled from the page with its own pattern.
    string producingArea = GetProperty( pageHtml, regProducingArea, "producingarea" );
    string ageText = GetProperty( pageHtml, regAge, "age" );
    string introText = GetProperty( pageHtml, regIntroduction, "introduction" );
    string reText = GetProperty( pageHtml, regRe, "re" );
    string imageSrc = GetProperty( pageHtml, regImage, "image" );
    string pageSpec = GetProperty( pageHtml, regSpec, "spec" );

    lsy.AddProduct(
        id,
        subcategoryid,
        itemName,
        itemNumber,
        pageSpec,
        producingArea,
        ageText,
        introText,
        reText,
        imageSrc,
        etPrice,
        vipPrice );
}