面临人才网的采集程序,请大家帮忙 没做过这类的程序,不知道怎么下手,全是一个人做,请做过类似的高手,可以提供点相关的资料或程序代码,本人在这里谢谢高手,如果能提供整个方案,本人可以提供经济,但是数目有限,希望大家体谅,本人也是打工的,没赚多少钱,因为项目比较急,又是我一个人做,才出此下策,请高手帮忙 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 也在学习中有一个简单的例子 private void button1_Click(object sender, EventArgs e) { string url = textBox1.Text; string str = DownHtml(url, System.Text.Encoding.GetEncoding("gb2312")); Regex re = new Regex("(\\d+-)?(\\d{4}-?\\d{7}|\\d{3}-?\\d{8}|^\\d{7,8})(-\\d+)?"); Match m = re.Match(str); int i=0; while (m.Success) { dataGridView1.Rows.Add(); dataGridView1.Rows[i++].Cells[0].Value = m.Value; m = m.NextMatch(); } MessageBox.Show("下载成功!!"); } /// <summary> /// 下载网页 /// </summary> /// <param name="Url">网址</param> /// <param name="myEncoding">编码</param> /// <returns>网页内容</returns> public static string DownHtml(string Url, Encoding myEncoding) { try { HttpWebRequest loHttp = (HttpWebRequest)WebRequest.Create(Url); loHttp.Timeout = 500; HttpWebResponse loWebResponse = (HttpWebResponse)loHttp.GetResponse(); StreamReader loResponseStream = new StreamReader(loWebResponse.GetResponseStream(), myEncoding); string html = loResponseStream.ReadToEnd(); loWebResponse.Close(); return html; } catch { return null; } } private void button2_Click(object sender, EventArgs e) { string BoardStream = DownHtml(textBox1 .Text , System.Text.Encoding.GetEncoding("gb2312")); StreamWriter saveAPage = new StreamWriter(Application .StartupPath +"\\aa.html", false, System.Text.Encoding.GetEncoding("gb2312"));//实例化写入类,保存路径假设为C:\a.html saveAPage.Write(BoardStream);//创建写入任务 saveAPage.Flush();//写入文件(即清理缓存流) saveAPage.Close();//关闭写入类的对象 MessageBox.Show("成功下载!!!"); // File.Open(Application.StartupPath + "\\aa.html", FileMode.Open, FileAccess.Read, FileShare.Read); System.Diagnostics.Process.Start(Application.StartupPath + "\\aa.html"); } WebClientWebRequest正则表达式 采集也就是从其他人才网采集信息吧,有过类似的代码采集csdn的 
http://www.cnblogs.com/hfzsjz/archive/2010/01/26/1656720.htmlhttp://www.cnblogs.com/hfzsjz/archive/2010/01/26/1656722.htmlhttp://download.csdn.net/source/1795639 http://download.csdn.net/source/1795639CSDN的这个资源下载好吗?不知道好不好用,资源分不多了…… 以前写的一个获取51job的一个Demopublic class Search51job { public int Total = 0; string Curr_page = null; public Search51job() { Get1stPage(); } string Get1stPage() { Uri uri = new Uri("http://search.51job.com/jobsearch/search_result.php"); HttpWebRequest wrq = WebRequest.Create(uri) as HttpWebRequest; string postdata = "keywordtype=2&jobarea=&funtype=0100&industrytype=01&stype=1&searchname=&fromType=1&keyword=.net&issuedate=2&workyear=&providesalary=&cotype=°reefrom=&jobterm="; byte[] data = Encoding.GetEncoding("gb2312").GetBytes(postdata); wrq.Method = "POST"; wrq.ContentType = "application/x-www-form-urlencoded"; wrq.ContentLength = data.Length; wrq.CookieContainer = new CookieContainer(); wrq.CookieContainer.Add(uri, new Cookie("guid", "1225348962627790079")); wrq.CookieContainer.Add(uri, new Cookie("51job", "cenglish%3D0")); wrq.Referer = "http://search.51job.com/jobsearch/advance_search.php"; Stream req = wrq.GetRequestStream(); req.Write(data, 0, data.Length); req.Close(); HttpWebResponse wrp = wrq.GetResponse() as HttpWebResponse; Stream resp = wrp.GetResponseStream(); StreamReader sr = new StreamReader(resp, Encoding.GetEncoding("gb2312")); string page = sr.ReadToEnd(); if (Curr_page == null) Curr_page = page; sr.Close(); resp.Close(); string pat = "约有\\s{1,}[^0-9]*([0-9]+)[^0-9]*\\s{1,}项符合条件的查询结果"; Match m = Regex.Match(page, pat, RegexOptions.IgnoreCase); Total = (int)Math.Ceiling(double.Parse(m.Result("$1")) / 30); return page; } string GetPage(int i) { if (i == 1) { return Get1stPage(); } if (Curr_page == null) Get1stPage(); Uri uri = new Uri("http://search.51job.com/jobsearch/search_result.php"); HttpWebRequest wrq = WebRequest.Create(uri) as HttpWebRequest; string postdata = getPostdata(i); byte[] data = 
Encoding.GetEncoding("gb2312").GetBytes(postdata); wrq.Method = "POST"; wrq.ContentType = "application/x-www-form-urlencoded"; wrq.ContentLength = data.Length; wrq.CookieContainer = new CookieContainer(); wrq.CookieContainer.Add(uri, new Cookie("guid", "1225348962627790079")); wrq.CookieContainer.Add(uri, new Cookie("51job", "cenglish%3D0")); wrq.Referer = "http://search.51job.com/jobsearch/search_result.php"; Stream req = wrq.GetRequestStream(); req.Write(data, 0, data.Length); req.Close(); HttpWebResponse wrp = wrq.GetResponse() as HttpWebResponse; Stream resp = wrp.GetResponseStream(); StreamReader sr = new StreamReader(resp, Encoding.GetEncoding("gb2312")); string page = sr.ReadToEnd(); if (Curr_page == null) Curr_page = page; sr.Close(); resp.Close(); return page; } string getPostdata(int n) { string[] names = { "postchannel", "stype", "district", "district", "funtype_big", "funtype", "industrytype", "issuedate", "keywordtype", "dis_keyword", "keyword", "workyear", "providesalary", "cotype", "degreefrom", "jobterm", "ord_field", "list_type", "last_list_type", "curr_page", "last_page", "nStart", "start_page", "total_page", "jobid_list", "jobid_count", "schTime", "statCount", "statData", "fromType" }; string post = ""; for (int i = 0; i < names.Length; i++) { if (names[i] == "fromType") post += names[i] + "=14&"; else if (names[i] == "curr_page") post += names[i] + "=" + n + "&"; else post += names[i] + "=" + getFieldValue(names[i]) + "&"; } return post.Substring(0, post.Length - 1); } string getFieldValue(string name) { // string pat = "<input\\s{1,}type=\"hidden\"\\s{1,}name=\""+name+"\"\\s{1,}value=\"([0-9a-zA-Z~\\|\\.]*)\">"; string pat = "<input\\s{1,}type=\"hidden\"\\s{1,}name=\"" + name + "\"\\s{1,}value=\"([^\"]*)\">"; Match match = Regex.Match(Curr_page, pat); string value = match.Result("$1"); return value; } public void GetPosition(int n) { string page = GetPage(n); string jobpat = 
"<a\\s{1,}href=\"\\/jobsearch\\/show_job_detail\\.php\\?id=\\(([0-9]+)\\)\"\\s{1,}onclick=\"javascript:OJDL\\([0-9]+\\)\"\\s{1,}target=\"_blank\"\\s{1,}class=\"jobname\"\\s{1}>([\\w\\W]+?)<\\/a>"; string companypat = "<a\\s{1,}href=\"\\/jobsearch\\/co_all_job\\.php\\?coid=\\(\\d{1,}\\)\"\\s{1,}target=\"_blank\"\\s{1,}class=\"coname\"\\s{1,}>([\\w\\W]+?)<\\/a>"; Match match = Regex.Match(page, jobpat, RegexOptions.IgnoreCase); Match match1 = Regex.Match(page, companypat, RegexOptions.IgnoreCase); while (match.Success && match1.Success) { string jobid = match.Result("$1"); string jobname = match.Result("$2"); string company = match1.Result("$1"); string email, description; getcompdetail(jobid, out email, out description); BusiEntry.Position p = new BusiEntry.Position(); p.jobname = jobname; p.email = email; p.company = company; p.description = description; Dal.SavePosition save = new Dal.SavePosition(); save.SavePos(p); match = match.NextMatch(); match1 = match1.NextMatch(); } } void getcompdetail(string id, out string email, out string desc) { string url = "http://search.51job.com/jobsearch/show_job_detail.php?id=(" + id + ")"; string page = getdetailpage(url); string emailpat = @"\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*"; Regex reg = new Regex(emailpat, RegexOptions.IgnoreCase); Match m = reg.Match(page); List<string> list = new List<string>(); while (m.Success) { string str = m.ToString(); if ((!list.Contains(str)) && (str.IndexOf("@51job.com")==-1)) list.Add(str); m = m.NextMatch(); } email = ""; for (int i = 0; i < list.Count; i++) email += list[i] + ","; if (email.Length > 0) email = email.Substring(0, email.Length - 1); desc = ""; } string getdetailpage(string url) { WebClient wc = new WebClient(); byte[] b = wc.DownloadData(url); return Encoding.GetEncoding("gb2312").GetString(b); } } 用WebRequest 爬网逻辑很重要 
我这段时间正在做一个读取ZHAOPIN.COM和51JOB.COM的软件,大致思路给你说一下:1、使用webrequest/webresponse获取URL的数据2、使用多线程处理3、ZHAOPIN.COM和51JOB.COM的读取URL如下:zhaopin.com:http://search.zhaopin.com/jobs/request.asp?page={0}&SchJobType={1}&SearchModel=051job.com:http://search.51job.com/jobsearch/search_result.php?fromJs=1&funtype=0000&industrytype={0}&issuedate=9&providesalary=99&keywordtype=2&lang=c&stype=2&workyear=99&cotype=99&degreefrom=99&jobterm=01&fromType=1&curr_page={1}4、分别对ZHAOPIN.COM和51JOB.COM的数据用正则做出解析,此处有两部分第一部分是根据(3)的URL获取当前搜索条件的分页数量,获取分页URL,并将分页URL写入到待下载队列中,第二部分是解析具体的职位信息页面,这些都可以用正则解析出来这两个网站没有做IP访问限制 其实这个还不是很复杂,你抓一下GOOGLE或阿里巴巴的企业数据试试,一会就会把你的IP屏蔽了 WebClient tmpClient = new WebClient();Uri uri = new Uri(url, UriKind.Absolute);tmpClient.Headers.Add(HttpRequestHeader.UserAgent, "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)");Stream stream = tmpClient.OpenRead(uri); 1、正则匹配链接2、使用httpwebrequest/httpwebresponse获取URL地址的内容 这个帖子让让人纠结? C#操作Excel刷新数据出现的问题,几个月了呀,高手咋还没出现 winform datagridview 如何添加checkbox这一列,用来选择数据 C#中如何限制textbox中只能输入数字和字母 关于小数位数设置问题(货币类型Money) window media player正在播放什么 dataview筛选 dbo 读取EXCEL 怎么获取EXCEL 行号? 再请大神帮忙正则 继承Canvas的控件,为何放到上面的其他控件看不到? C# 获取邮箱邮件!如126@。com 读数据库然后写文件,速度很慢
有一个简单的例子
/// <summary>
/// Downloads the page at the URL typed into textBox1 and extracts phone-number
/// looking strings (area code + 7/8 digit landline, optional extension) into
/// the first column of dataGridView1.
/// </summary>
private void button1_Click(object sender, EventArgs e)
{
    string url = textBox1.Text;
    string str = DownHtml(url, System.Text.Encoding.GetEncoding("gb2312"));
    // DownHtml returns null on any download failure; without this guard the
    // original code crashed with ArgumentNullException inside Regex.Match
    // and would otherwise report success for a failed download.
    if (str == null)
    {
        MessageBox.Show("下载失败!!");
        return;
    }
    // (area-)? then 4+7 or 3+8 digit landline, or a bare 7/8 digit number, (-ext)?
    Regex re = new Regex("(\\d+-)?(\\d{4}-?\\d{7}|\\d{3}-?\\d{8}|^\\d{7,8})(-\\d+)?");
    int i = 0;
    for (Match m = re.Match(str); m.Success; m = m.NextMatch())
    {
        dataGridView1.Rows.Add();
        dataGridView1.Rows[i++].Cells[0].Value = m.Value;
    }
    MessageBox.Show("下载成功!!");
}
/// <summary>
/// 下载网页 — downloads a web page and decodes it as a string.
/// </summary>
/// <param name="Url">网址 — absolute URL to fetch</param>
/// <param name="myEncoding">编码 — encoding used to decode the response body</param>
/// <returns>网页内容 — the page HTML, or null when the request fails for any reason</returns>
public static string DownHtml(string Url, Encoding myEncoding)
{
    try
    {
        HttpWebRequest loHttp = (HttpWebRequest)WebRequest.Create(Url);
        // The original 500 ms timeout aborts almost any real page download;
        // 30 s is a sane default.
        loHttp.Timeout = 30000;
        // using-blocks dispose the response and reader even when decoding throws
        // (the original leaked the StreamReader and, on error, the response).
        using (HttpWebResponse loWebResponse = (HttpWebResponse)loHttp.GetResponse())
        using (StreamReader loResponseStream = new StreamReader(loWebResponse.GetResponseStream(), myEncoding))
        {
            return loResponseStream.ReadToEnd();
        }
    }
    catch
    {
        // Best-effort download by design: callers treat null as "failed".
        return null;
    }
}

/// <summary>
/// Downloads the page at the URL in textBox1, saves it next to the executable
/// as aa.html (gb2312-encoded) and opens it with the shell's default handler.
/// </summary>
private void button2_Click(object sender, EventArgs e)
{
    string BoardStream = DownHtml(textBox1.Text, System.Text.Encoding.GetEncoding("gb2312"));
    // Without this guard an empty aa.html was written and reported as success.
    if (BoardStream == null)
    {
        MessageBox.Show("下载失败!!!");
        return;
    }
    string path = Application.StartupPath + "\\aa.html";
    // using flushes and closes the writer even if Write throws.
    using (StreamWriter saveAPage = new StreamWriter(path, false, System.Text.Encoding.GetEncoding("gb2312")))
    {
        saveAPage.Write(BoardStream);
    }
    MessageBox.Show("成功下载!!!");
    System.Diagnostics.Process.Start(path);
}
WebRequest
正则表达式
http://www.cnblogs.com/hfzsjz/archive/2010/01/26/1656722.html
http://download.csdn.net/source/1795639
CSDN的这个资源下载好吗?不知道好不好用,资源分不多了……
{
// Total number of result pages (hit count / 30), filled in by Get1stPage().
public int Total = 0;
// Raw HTML of the most recently cached results page; also the source of the
// hidden form fields that getPostdata() replays on later POSTs.
string Curr_page = null; public Search51job()
{
// Fetch page 1 immediately so Total and Curr_page are usable after construction.
// NOTE(review): this performs a blocking network request inside the constructor.
Get1stPage();
}
/// <summary>
/// POSTs the initial ".net" job search to 51job, caches the response HTML in
/// Curr_page, parses the hit count into Total (30 hits per page) and returns
/// the page HTML.
/// </summary>
string Get1stPage()
{
    // NOTE: the scraped original contained "&cotype=°reefrom=" — mojibake from
    // HTML-entity decoding of "&cotype=&degreefrom=" ("&deg" -> "°"); restored here.
    string postdata = "keywordtype=2&jobarea=&funtype=0100&industrytype=01&stype=1&searchname=&fromType=1&keyword=.net&issuedate=2&workyear=&providesalary=&cotype=&degreefrom=&jobterm=";
    string page = PostSearch(postdata, "http://search.51job.com/jobsearch/advance_search.php");
    if (Curr_page == null)
        Curr_page = page;
    // "约有 N 项符合条件的查询结果" — N total hits, 30 per page.
    string pat = "约有\\s{1,}[^0-9]*([0-9]+)[^0-9]*\\s{1,}项符合条件的查询结果";
    Match m = Regex.Match(page, pat, RegexOptions.IgnoreCase);
    // Match.Result on a failed match throws NotSupportedException; keep Total at
    // its previous value when the site changes its summary wording.
    if (m.Success)
        Total = (int)Math.Ceiling(double.Parse(m.Result("$1")) / 30);
    return page;
}

/// <summary>
/// Fetches result page <paramref name="i"/> (1-based). Page 1 goes through
/// Get1stPage(); later pages replay the hidden form fields scraped from the
/// cached first page.
/// </summary>
string GetPage(int i)
{
    if (i == 1)
    {
        return Get1stPage();
    }
    if (Curr_page == null)
        Get1stPage(); // hidden fields must be cached before getPostdata() can run
    string page = PostSearch(getPostdata(i), "http://search.51job.com/jobsearch/search_result.php");
    if (Curr_page == null)
        Curr_page = page;
    return page;
}

/// <summary>
/// Shared POST helper (the original duplicated this verbatim in Get1stPage and
/// GetPage): sends <paramref name="postdata"/> gb2312-encoded to the 51job
/// search endpoint with the session cookies and returns the decoded body.
/// </summary>
string PostSearch(string postdata, string referer)
{
    Uri uri = new Uri("http://search.51job.com/jobsearch/search_result.php");
    HttpWebRequest wrq = WebRequest.Create(uri) as HttpWebRequest;
    byte[] data = Encoding.GetEncoding("gb2312").GetBytes(postdata);
    wrq.Method = "POST";
    wrq.ContentType = "application/x-www-form-urlencoded";
    wrq.ContentLength = data.Length;
    wrq.CookieContainer = new CookieContainer();
    // Session cookies copied from a browser capture — TODO(review): refresh these
    // if the site starts rejecting requests.
    wrq.CookieContainer.Add(uri, new Cookie("guid", "1225348962627790079"));
    wrq.CookieContainer.Add(uri, new Cookie("51job", "cenglish%3D0"));
    wrq.Referer = referer;
    using (Stream req = wrq.GetRequestStream())
    {
        req.Write(data, 0, data.Length);
    }
    // using guarantees the response and reader are released even on a decode error
    // (the original only closed them on the success path).
    using (HttpWebResponse wrp = wrq.GetResponse() as HttpWebResponse)
    using (StreamReader sr = new StreamReader(wrp.GetResponseStream(), Encoding.GetEncoding("gb2312")))
    {
        return sr.ReadToEnd();
    }
}
/// <summary>
/// Builds the POST body for result page <paramref name="n"/> by replaying the
/// hidden form fields of the cached first page, overriding the paging fields.
/// </summary>
/// <param name="n">1-based page number placed into curr_page.</param>
/// <returns>URL-encoded form body, e.g. "postchannel=...&amp;curr_page=2&amp;...".</returns>
string getPostdata(int n)
{
    // Field order matters to the server-side script, so it is kept verbatim.
    // "district" appears twice on purpose: the search form posts two district inputs.
    string[] names = { "postchannel", "stype", "district", "district",
        "funtype_big", "funtype", "industrytype", "issuedate",
        "keywordtype", "dis_keyword", "keyword", "workyear",
        "providesalary", "cotype", "degreefrom", "jobterm",
        "ord_field", "list_type", "last_list_type", "curr_page",
        "last_page", "nStart", "start_page", "total_page", "jobid_list",
        "jobid_count", "schTime", "statCount", "statData", "fromType" };
    // StringBuilder replaces the original O(n^2) string concatenation; joining
    // with a leading '&' also removes the fragile trailing-separator trim.
    StringBuilder post = new StringBuilder();
    foreach (string name in names)
    {
        if (post.Length > 0)
            post.Append('&');
        if (name == "fromType")
            post.Append(name).Append("=14");          // fixed marker value
        else if (name == "curr_page")
            post.Append(name).Append('=').Append(n);  // requested page
        else
            post.Append(name).Append('=').Append(getFieldValue(name));
    }
    return post.ToString();
}
/// <summary>
/// Scrapes the value of the hidden &lt;input&gt; named <paramref name="name"/>
/// out of the cached first results page (Curr_page).
/// </summary>
/// <returns>The input's value attribute, or "" when the field is absent.</returns>
string getFieldValue(string name)
{
    string pat = "<input\\s{1,}type=\"hidden\"\\s{1,}name=\"" + name + "\"\\s{1,}value=\"([^\"]*)\">";
    Match match = Regex.Match(Curr_page, pat);
    // Match.Result throws NotSupportedException on a failed match; degrade a
    // missing hidden field to an empty form value instead of crashing.
    return match.Success ? match.Groups[1].Value : "";
}
/// <summary>
/// Downloads result page n, walks job links and company links with two parallel
/// regexes (advanced in lockstep, so they are assumed to appear in matching
/// order — TODO confirm on the live page), fetches each job's contact e-mail
/// and persists every position through the DAL.
/// </summary>
public void GetPosition(int n)
{
string page = GetPage(n);
// $1 = numeric job id (inside literal parentheses in the href), $2 = job title.
string jobpat = "<a\\s{1,}href=\"\\/jobsearch\\/show_job_detail\\.php\\?id=\\(([0-9]+)\\)\"\\s{1,}onclick=\"javascript:OJDL\\([0-9]+\\)\"\\s{1,}target=\"_blank\"\\s{1,}class=\"jobname\"\\s{1}>([\\w\\W]+?)<\\/a>";
// $1 = company display name from the class="coname" anchor.
string companypat = "<a\\s{1,}href=\"\\/jobsearch\\/co_all_job\\.php\\?coid=\\(\\d{1,}\\)\"\\s{1,}target=\"_blank\"\\s{1,}class=\"coname\"\\s{1,}>([\\w\\W]+?)<\\/a>"; Match match = Regex.Match(page, jobpat, RegexOptions.IgnoreCase);
Match match1 = Regex.Match(page, companypat, RegexOptions.IgnoreCase);
// Stops as soon as either regex runs out of matches.
while (match.Success && match1.Success)
{
string jobid = match.Result("$1");
string jobname = match.Result("$2");
string company = match1.Result("$1");
string email, description;
// Extra HTTP round-trip per job to pull contact e-mails off the detail page.
getcompdetail(jobid, out email, out description);
BusiEntry.Position p = new BusiEntry.Position();
p.jobname = jobname;
p.email = email;
p.company = company;
// description is always "" here — getcompdetail never fills it in.
p.description = description; Dal.SavePosition save = new Dal.SavePosition();
save.SavePos(p); match = match.NextMatch();
match1 = match1.NextMatch();
} }
/// <summary>
/// Fetches the job-detail page for the given job id and extracts every contact
/// e-mail address on it — deduplicated, with 51job's own mailboxes excluded —
/// as a comma-separated list. desc is always returned empty.
/// </summary>
/// <param name="id">Numeric job id from the search-result link.</param>
/// <param name="email">Comma-joined unique addresses, "" when none found.</param>
/// <param name="desc">Always "" (not extracted).</param>
void getcompdetail(string id, out string email, out string desc)
{
    string url = "http://search.51job.com/jobsearch/show_job_detail.php?id=(" + id + ")";
    string page = getdetailpage(url);
    string emailpat = @"\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*";
    List<string> found = new List<string>();
    for (Match hit = Regex.Match(page, emailpat, RegexOptions.IgnoreCase);
         hit.Success;
         hit = hit.NextMatch())
    {
        string addr = hit.Value;
        // Keep first occurrence only; drop the site's own addresses.
        if (!found.Contains(addr) && addr.IndexOf("@51job.com") == -1)
            found.Add(addr);
    }
    email = string.Join(",", found.ToArray());
    desc = "";
}
/// <summary>
/// Downloads the raw bytes at <paramref name="url"/> and decodes them as gb2312
/// (decoding bytes directly sidesteps WebClient's default-encoding guessing).
/// </summary>
string getdetailpage(string url)
{
    // WebClient is IDisposable; the original leaked the underlying resources.
    using (WebClient wc = new WebClient())
    {
        byte[] b = wc.DownloadData(url);
        return Encoding.GetEncoding("gb2312").GetString(b);
    }
}
}
1、使用webrequest/webresponse获取URL的数据
2、使用多线程处理
3、ZHAOPIN.COM和51JOB.COM的读取URL如下:
zhaopin.com:http://search.zhaopin.com/jobs/request.asp?page={0}&SchJobType={1}&SearchModel=0
51job.com:http://search.51job.com/jobsearch/search_result.php?fromJs=1&funtype=0000&industrytype={0}&issuedate=9&providesalary=99&keywordtype=2&lang=c&stype=2&workyear=99&cotype=99&degreefrom=99&jobterm=01&fromType=1&curr_page={1}
4、分别对ZHAOPIN.COM和51JOB.COM的数据用正则做出解析,此处有两部分
第一部分是根据(3)的URL获取当前搜索条件的分页数量,获取分页URL,并将分页URL写入到待下载队列中,
第二部分是解析具体的职位信息页面,这些都可以用正则解析出来这两个网站没有做IP访问限制
Uri uri = new Uri(url, UriKind.Absolute);
tmpClient.Headers.Add(HttpRequestHeader.UserAgent, "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)");
Stream stream = tmpClient.OpenRead(uri);
2、使用httpwebrequest/httpwebresponse获取URL地址的内容