面临人才网的采集程序,请大家帮忙 没做过这类的程序,不知道怎么下手,全是一个人做,请做过类似的高手,可以提供点相关的资料或程序代码,本人在这里谢谢高手,如果能提供整个方案,本人可以提供经济,但是数目有限,希望大家体谅,本人也是打工的,没赚多少钱,因为项目比较急,又是我一个人做,才出此下策,请高手帮忙 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 也在学习中有一个简单的例子 private void button1_Click(object sender, EventArgs e) { string url = textBox1.Text; string str = DownHtml(url, System.Text.Encoding.GetEncoding("gb2312")); Regex re = new Regex("(\\d+-)?(\\d{4}-?\\d{7}|\\d{3}-?\\d{8}|^\\d{7,8})(-\\d+)?"); Match m = re.Match(str); int i=0; while (m.Success) { dataGridView1.Rows.Add(); dataGridView1.Rows[i++].Cells[0].Value = m.Value; m = m.NextMatch(); } MessageBox.Show("下载成功!!"); } /// <summary> /// 下载网页 /// </summary> /// <param name="Url">网址</param> /// <param name="myEncoding">编码</param> /// <returns>网页内容</returns> public static string DownHtml(string Url, Encoding myEncoding) { try { HttpWebRequest loHttp = (HttpWebRequest)WebRequest.Create(Url); loHttp.Timeout = 500; HttpWebResponse loWebResponse = (HttpWebResponse)loHttp.GetResponse(); StreamReader loResponseStream = new StreamReader(loWebResponse.GetResponseStream(), myEncoding); string html = loResponseStream.ReadToEnd(); loWebResponse.Close(); return html; } catch { return null; } } private void button2_Click(object sender, EventArgs e) { string BoardStream = DownHtml(textBox1 .Text , System.Text.Encoding.GetEncoding("gb2312")); StreamWriter saveAPage = new StreamWriter(Application .StartupPath +"\\aa.html", false, System.Text.Encoding.GetEncoding("gb2312"));//实例化写入类,保存路径假设为C:\a.html saveAPage.Write(BoardStream);//创建写入任务 saveAPage.Flush();//写入文件(即清理缓存流) saveAPage.Close();//关闭写入类的对象 MessageBox.Show("成功下载!!!"); // File.Open(Application.StartupPath + "\\aa.html", FileMode.Open, FileAccess.Read, FileShare.Read); System.Diagnostics.Process.Start(Application.StartupPath + "\\aa.html"); } WebClientWebRequest正则表达式 采集也就是从其他人才网采集信息吧,有过类似的代码采集csdn的 
http://www.cnblogs.com/hfzsjz/archive/2010/01/26/1656720.htmlhttp://www.cnblogs.com/hfzsjz/archive/2010/01/26/1656722.htmlhttp://download.csdn.net/source/1795639 http://download.csdn.net/source/1795639CSDN的这个资源下载好吗?不知道好不好用,资源分不多了…… 以前写的一个获取51job的一个Demopublic class Search51job { public int Total = 0; string Curr_page = null; public Search51job() { Get1stPage(); } string Get1stPage() { Uri uri = new Uri("http://search.51job.com/jobsearch/search_result.php"); HttpWebRequest wrq = WebRequest.Create(uri) as HttpWebRequest; string postdata = "keywordtype=2&jobarea=&funtype=0100&industrytype=01&stype=1&searchname=&fromType=1&keyword=.net&issuedate=2&workyear=&providesalary=&cotype=°reefrom=&jobterm="; byte[] data = Encoding.GetEncoding("gb2312").GetBytes(postdata); wrq.Method = "POST"; wrq.ContentType = "application/x-www-form-urlencoded"; wrq.ContentLength = data.Length; wrq.CookieContainer = new CookieContainer(); wrq.CookieContainer.Add(uri, new Cookie("guid", "1225348962627790079")); wrq.CookieContainer.Add(uri, new Cookie("51job", "cenglish%3D0")); wrq.Referer = "http://search.51job.com/jobsearch/advance_search.php"; Stream req = wrq.GetRequestStream(); req.Write(data, 0, data.Length); req.Close(); HttpWebResponse wrp = wrq.GetResponse() as HttpWebResponse; Stream resp = wrp.GetResponseStream(); StreamReader sr = new StreamReader(resp, Encoding.GetEncoding("gb2312")); string page = sr.ReadToEnd(); if (Curr_page == null) Curr_page = page; sr.Close(); resp.Close(); string pat = "约有\\s{1,}[^0-9]*([0-9]+)[^0-9]*\\s{1,}项符合条件的查询结果"; Match m = Regex.Match(page, pat, RegexOptions.IgnoreCase); Total = (int)Math.Ceiling(double.Parse(m.Result("$1")) / 30); return page; } string GetPage(int i) { if (i == 1) { return Get1stPage(); } if (Curr_page == null) Get1stPage(); Uri uri = new Uri("http://search.51job.com/jobsearch/search_result.php"); HttpWebRequest wrq = WebRequest.Create(uri) as HttpWebRequest; string postdata = getPostdata(i); byte[] data = 
Encoding.GetEncoding("gb2312").GetBytes(postdata); wrq.Method = "POST"; wrq.ContentType = "application/x-www-form-urlencoded"; wrq.ContentLength = data.Length; wrq.CookieContainer = new CookieContainer(); wrq.CookieContainer.Add(uri, new Cookie("guid", "1225348962627790079")); wrq.CookieContainer.Add(uri, new Cookie("51job", "cenglish%3D0")); wrq.Referer = "http://search.51job.com/jobsearch/search_result.php"; Stream req = wrq.GetRequestStream(); req.Write(data, 0, data.Length); req.Close(); HttpWebResponse wrp = wrq.GetResponse() as HttpWebResponse; Stream resp = wrp.GetResponseStream(); StreamReader sr = new StreamReader(resp, Encoding.GetEncoding("gb2312")); string page = sr.ReadToEnd(); if (Curr_page == null) Curr_page = page; sr.Close(); resp.Close(); return page; } string getPostdata(int n) { string[] names = { "postchannel", "stype", "district", "district", "funtype_big", "funtype", "industrytype", "issuedate", "keywordtype", "dis_keyword", "keyword", "workyear", "providesalary", "cotype", "degreefrom", "jobterm", "ord_field", "list_type", "last_list_type", "curr_page", "last_page", "nStart", "start_page", "total_page", "jobid_list", "jobid_count", "schTime", "statCount", "statData", "fromType" }; string post = ""; for (int i = 0; i < names.Length; i++) { if (names[i] == "fromType") post += names[i] + "=14&"; else if (names[i] == "curr_page") post += names[i] + "=" + n + "&"; else post += names[i] + "=" + getFieldValue(names[i]) + "&"; } return post.Substring(0, post.Length - 1); } string getFieldValue(string name) { // string pat = "<input\\s{1,}type=\"hidden\"\\s{1,}name=\""+name+"\"\\s{1,}value=\"([0-9a-zA-Z~\\|\\.]*)\">"; string pat = "<input\\s{1,}type=\"hidden\"\\s{1,}name=\"" + name + "\"\\s{1,}value=\"([^\"]*)\">"; Match match = Regex.Match(Curr_page, pat); string value = match.Result("$1"); return value; } public void GetPosition(int n) { string page = GetPage(n); string jobpat = 
"<a\\s{1,}href=\"\\/jobsearch\\/show_job_detail\\.php\\?id=\\(([0-9]+)\\)\"\\s{1,}onclick=\"javascript:OJDL\\([0-9]+\\)\"\\s{1,}target=\"_blank\"\\s{1,}class=\"jobname\"\\s{1}>([\\w\\W]+?)<\\/a>"; string companypat = "<a\\s{1,}href=\"\\/jobsearch\\/co_all_job\\.php\\?coid=\\(\\d{1,}\\)\"\\s{1,}target=\"_blank\"\\s{1,}class=\"coname\"\\s{1,}>([\\w\\W]+?)<\\/a>"; Match match = Regex.Match(page, jobpat, RegexOptions.IgnoreCase); Match match1 = Regex.Match(page, companypat, RegexOptions.IgnoreCase); while (match.Success && match1.Success) { string jobid = match.Result("$1"); string jobname = match.Result("$2"); string company = match1.Result("$1"); string email, description; getcompdetail(jobid, out email, out description); BusiEntry.Position p = new BusiEntry.Position(); p.jobname = jobname; p.email = email; p.company = company; p.description = description; Dal.SavePosition save = new Dal.SavePosition(); save.SavePos(p); match = match.NextMatch(); match1 = match1.NextMatch(); } } void getcompdetail(string id, out string email, out string desc) { string url = "http://search.51job.com/jobsearch/show_job_detail.php?id=(" + id + ")"; string page = getdetailpage(url); string emailpat = @"\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*"; Regex reg = new Regex(emailpat, RegexOptions.IgnoreCase); Match m = reg.Match(page); List<string> list = new List<string>(); while (m.Success) { string str = m.ToString(); if ((!list.Contains(str)) && (str.IndexOf("@51job.com")==-1)) list.Add(str); m = m.NextMatch(); } email = ""; for (int i = 0; i < list.Count; i++) email += list[i] + ","; if (email.Length > 0) email = email.Substring(0, email.Length - 1); desc = ""; } string getdetailpage(string url) { WebClient wc = new WebClient(); byte[] b = wc.DownloadData(url); return Encoding.GetEncoding("gb2312").GetString(b); } } 用WebRequest 爬网逻辑很重要 
我这段时间正在做一个读取ZHAOPIN.COM和51JOB.COM的软件,大致思路给你说一下:1、使用webrequest/webresponse获取URL的数据2、使用多线程处理3、ZHAOPIN.COM和51JOB.COM的读取URL如下:zhaopin.com:http://search.zhaopin.com/jobs/request.asp?page={0}&SchJobType={1}&SearchModel=051job.com:http://search.51job.com/jobsearch/search_result.php?fromJs=1&funtype=0000&industrytype={0}&issuedate=9&providesalary=99&keywordtype=2&lang=c&stype=2&workyear=99&cotype=99&degreefrom=99&jobterm=01&fromType=1&curr_page={1}4、分别对ZHAOPIN.COM和51JOB.COM的数据用正则做出解析,此处有两部分第一部分是根据(3)的URL获取当前搜索条件的分页数量,获取分页URL,并将分页URL写入到待下载队列中,第二部分是解析具体的职位信息页面,这些都可以用正则解析出来这两个网站没有做IP访问限制 其实这个还不是很复杂,你抓一下GOOGLE或阿里巴巴的企业数据试试,一会就会把你的IP屏蔽了 WebClient tmpClient = new WebClient();Uri uri = new Uri(url, UriKind.Absolute);tmpClient.Headers.Add(HttpRequestHeader.UserAgent, "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)");Stream stream = tmpClient.OpenRead(uri); 1、正则匹配链接2、使用httpwebrequest/httpwebresponse获取URL地址的内容 这个帖子让让人纠结? C#操作Excel刷新数据出现的问题,几个月了呀,高手咋还没出现 winform datagridview 如何添加checkbox这一列,用来选择数据 C#中如何限制textbox中只能输入数字和字母 关于小数位数设置问题(货币类型Money) window media player正在播放什么 dataview筛选 dbo 读取EXCEL 怎么获取EXCEL 行号? 再请大神帮忙正则 继承Canvas的控件,为何放到上面的其他控件看不到? C# 获取邮箱邮件!如126@。com 读数据库然后写文件,速度很慢
有一个简单的例子
/// <summary>
/// Downloads the page at the URL typed into textBox1 and extracts phone-number
/// looking strings (area code + 7/8 digit landline, optional extension) into
/// the first column of dataGridView1.
/// </summary>
private void button1_Click(object sender, EventArgs e)
{
    string url = textBox1.Text;
    string str = DownHtml(url, System.Text.Encoding.GetEncoding("gb2312"));
    // DownHtml returns null on any download failure; without this guard the
    // original code crashed with ArgumentNullException inside Regex.Match
    // and would otherwise report success for a failed download.
    if (str == null)
    {
        MessageBox.Show("下载失败!!");
        return;
    }
    // (area-)? then 4+7 or 3+8 digit landline, or a bare 7/8 digit number, (-ext)?
    Regex re = new Regex("(\\d+-)?(\\d{4}-?\\d{7}|\\d{3}-?\\d{8}|^\\d{7,8})(-\\d+)?");
    int i = 0;
    for (Match m = re.Match(str); m.Success; m = m.NextMatch())
    {
        dataGridView1.Rows.Add();
        dataGridView1.Rows[i++].Cells[0].Value = m.Value;
    }
    MessageBox.Show("下载成功!!");
}
/// <summary>
/// 下载网页 — downloads a web page and decodes it as a string.
/// </summary>
/// <param name="Url">网址 — absolute URL to fetch</param>
/// <param name="myEncoding">编码 — encoding used to decode the response body</param>
/// <returns>网页内容 — the page HTML, or null when the request fails for any reason</returns>
public static string DownHtml(string Url, Encoding myEncoding)
{
    try
    {
        HttpWebRequest loHttp = (HttpWebRequest)WebRequest.Create(Url);
        // The original 500 ms timeout aborts almost any real page download;
        // 30 s is a sane default.
        loHttp.Timeout = 30000;
        // using-blocks dispose the response and reader even when decoding throws
        // (the original leaked the StreamReader and, on error, the response).
        using (HttpWebResponse loWebResponse = (HttpWebResponse)loHttp.GetResponse())
        using (StreamReader loResponseStream = new StreamReader(loWebResponse.GetResponseStream(), myEncoding))
        {
            return loResponseStream.ReadToEnd();
        }
    }
    catch
    {
        // Best-effort download by design: callers treat null as "failed".
        return null;
    }
}

/// <summary>
/// Downloads the page at the URL in textBox1, saves it next to the executable
/// as aa.html (gb2312-encoded) and opens it with the shell's default handler.
/// </summary>
private void button2_Click(object sender, EventArgs e)
{
    string BoardStream = DownHtml(textBox1.Text, System.Text.Encoding.GetEncoding("gb2312"));
    // Without this guard an empty aa.html was written and reported as success.
    if (BoardStream == null)
    {
        MessageBox.Show("下载失败!!!");
        return;
    }
    string path = Application.StartupPath + "\\aa.html";
    // using flushes and closes the writer even if Write throws.
    using (StreamWriter saveAPage = new StreamWriter(path, false, System.Text.Encoding.GetEncoding("gb2312")))
    {
        saveAPage.Write(BoardStream);
    }
    MessageBox.Show("成功下载!!!");
    System.Diagnostics.Process.Start(path);
}
WebRequest
正则表达式
http://www.cnblogs.com/hfzsjz/archive/2010/01/26/1656722.html
http://download.csdn.net/source/1795639
CSDN的这个资源下载好吗?不知道好不好用,资源分不多了……
{
// Total number of result pages (hit count / 30), filled in by Get1stPage().
public int Total = 0;
// Raw HTML of the most recently cached results page; also the source of the
// hidden form fields that getPostdata() replays on later POSTs.
string Curr_page = null; public Search51job()
{
// Fetch page 1 immediately so Total and Curr_page are usable after construction.
// NOTE(review): this performs a blocking network request inside the constructor.
Get1stPage();
}
/// <summary>
/// POSTs the initial ".net" job search to 51job, caches the response HTML in
/// Curr_page, parses the hit count into Total (30 hits per page) and returns
/// the page HTML.
/// </summary>
string Get1stPage()
{
    // NOTE: the scraped original contained "&cotype=°reefrom=" — mojibake from
    // HTML-entity decoding of "&cotype=&degreefrom=" ("&deg" -> "°"); restored here.
    string postdata = "keywordtype=2&jobarea=&funtype=0100&industrytype=01&stype=1&searchname=&fromType=1&keyword=.net&issuedate=2&workyear=&providesalary=&cotype=&degreefrom=&jobterm=";
    string page = PostSearch(postdata, "http://search.51job.com/jobsearch/advance_search.php");
    if (Curr_page == null)
        Curr_page = page;
    // "约有 N 项符合条件的查询结果" — N total hits, 30 per page.
    string pat = "约有\\s{1,}[^0-9]*([0-9]+)[^0-9]*\\s{1,}项符合条件的查询结果";
    Match m = Regex.Match(page, pat, RegexOptions.IgnoreCase);
    // Match.Result on a failed match throws NotSupportedException; keep Total at
    // its previous value when the site changes its summary wording.
    if (m.Success)
        Total = (int)Math.Ceiling(double.Parse(m.Result("$1")) / 30);
    return page;
}

/// <summary>
/// Fetches result page <paramref name="i"/> (1-based). Page 1 goes through
/// Get1stPage(); later pages replay the hidden form fields scraped from the
/// cached first page.
/// </summary>
string GetPage(int i)
{
    if (i == 1)
    {
        return Get1stPage();
    }
    if (Curr_page == null)
        Get1stPage(); // hidden fields must be cached before getPostdata() can run
    string page = PostSearch(getPostdata(i), "http://search.51job.com/jobsearch/search_result.php");
    if (Curr_page == null)
        Curr_page = page;
    return page;
}

/// <summary>
/// Shared POST helper (the original duplicated this verbatim in Get1stPage and
/// GetPage): sends <paramref name="postdata"/> gb2312-encoded to the 51job
/// search endpoint with the session cookies and returns the decoded body.
/// </summary>
string PostSearch(string postdata, string referer)
{
    Uri uri = new Uri("http://search.51job.com/jobsearch/search_result.php");
    HttpWebRequest wrq = WebRequest.Create(uri) as HttpWebRequest;
    byte[] data = Encoding.GetEncoding("gb2312").GetBytes(postdata);
    wrq.Method = "POST";
    wrq.ContentType = "application/x-www-form-urlencoded";
    wrq.ContentLength = data.Length;
    wrq.CookieContainer = new CookieContainer();
    // Session cookies copied from a browser capture — TODO(review): refresh these
    // if the site starts rejecting requests.
    wrq.CookieContainer.Add(uri, new Cookie("guid", "1225348962627790079"));
    wrq.CookieContainer.Add(uri, new Cookie("51job", "cenglish%3D0"));
    wrq.Referer = referer;
    using (Stream req = wrq.GetRequestStream())
    {
        req.Write(data, 0, data.Length);
    }
    // using guarantees the response and reader are released even on a decode error
    // (the original only closed them on the success path).
    using (HttpWebResponse wrp = wrq.GetResponse() as HttpWebResponse)
    using (StreamReader sr = new StreamReader(wrp.GetResponseStream(), Encoding.GetEncoding("gb2312")))
    {
        return sr.ReadToEnd();
    }
}
/// <summary>
/// Builds the POST body for result page <paramref name="n"/> by replaying the
/// hidden form fields of the cached first page, overriding the paging fields.
/// </summary>
/// <param name="n">1-based page number placed into curr_page.</param>
/// <returns>URL-encoded form body, e.g. "postchannel=...&amp;curr_page=2&amp;...".</returns>
string getPostdata(int n)
{
    // Field order matters to the server-side script, so it is kept verbatim.
    // "district" appears twice on purpose: the search form posts two district inputs.
    string[] names = { "postchannel", "stype", "district", "district",
        "funtype_big", "funtype", "industrytype", "issuedate",
        "keywordtype", "dis_keyword", "keyword", "workyear",
        "providesalary", "cotype", "degreefrom", "jobterm",
        "ord_field", "list_type", "last_list_type", "curr_page",
        "last_page", "nStart", "start_page", "total_page", "jobid_list",
        "jobid_count", "schTime", "statCount", "statData", "fromType" };
    // StringBuilder replaces the original O(n^2) string concatenation; joining
    // with a leading '&' also removes the fragile trailing-separator trim.
    StringBuilder post = new StringBuilder();
    foreach (string name in names)
    {
        if (post.Length > 0)
            post.Append('&');
        if (name == "fromType")
            post.Append(name).Append("=14");          // fixed marker value
        else if (name == "curr_page")
            post.Append(name).Append('=').Append(n);  // requested page
        else
            post.Append(name).Append('=').Append(getFieldValue(name));
    }
    return post.ToString();
}
/// <summary>
/// Scrapes the value of the hidden &lt;input&gt; named <paramref name="name"/>
/// out of the cached first results page (Curr_page).
/// </summary>
/// <returns>The input's value attribute, or "" when the field is absent.</returns>
string getFieldValue(string name)
{
    string pat = "<input\\s{1,}type=\"hidden\"\\s{1,}name=\"" + name + "\"\\s{1,}value=\"([^\"]*)\">";
    Match match = Regex.Match(Curr_page, pat);
    // Match.Result throws NotSupportedException on a failed match; degrade a
    // missing hidden field to an empty form value instead of crashing.
    return match.Success ? match.Groups[1].Value : "";
}
/// <summary>
/// Downloads result page n, walks job links and company links with two parallel
/// regexes (advanced in lockstep, so they are assumed to appear in matching
/// order — TODO confirm on the live page), fetches each job's contact e-mail
/// and persists every position through the DAL.
/// </summary>
public void GetPosition(int n)
{
string page = GetPage(n);
// $1 = numeric job id (inside literal parentheses in the href), $2 = job title.
string jobpat = "<a\\s{1,}href=\"\\/jobsearch\\/show_job_detail\\.php\\?id=\\(([0-9]+)\\)\"\\s{1,}onclick=\"javascript:OJDL\\([0-9]+\\)\"\\s{1,}target=\"_blank\"\\s{1,}class=\"jobname\"\\s{1}>([\\w\\W]+?)<\\/a>";
// $1 = company display name from the class="coname" anchor.
string companypat = "<a\\s{1,}href=\"\\/jobsearch\\/co_all_job\\.php\\?coid=\\(\\d{1,}\\)\"\\s{1,}target=\"_blank\"\\s{1,}class=\"coname\"\\s{1,}>([\\w\\W]+?)<\\/a>"; Match match = Regex.Match(page, jobpat, RegexOptions.IgnoreCase);
Match match1 = Regex.Match(page, companypat, RegexOptions.IgnoreCase);
// Stops as soon as either regex runs out of matches.
while (match.Success && match1.Success)
{
string jobid = match.Result("$1");
string jobname = match.Result("$2");
string company = match1.Result("$1");
string email, description;
// Extra HTTP round-trip per job to pull contact e-mails off the detail page.
getcompdetail(jobid, out email, out description);
BusiEntry.Position p = new BusiEntry.Position();
p.jobname = jobname;
p.email = email;
p.company = company;
// description is always "" here — getcompdetail never fills it in.
p.description = description; Dal.SavePosition save = new Dal.SavePosition();
save.SavePos(p); match = match.NextMatch();
match1 = match1.NextMatch();
} }
/// <summary>
/// Fetches the job-detail page for the given job id and extracts every contact
/// e-mail address on it — deduplicated, with 51job's own mailboxes excluded —
/// as a comma-separated list. desc is always returned empty.
/// </summary>
/// <param name="id">Numeric job id from the search-result link.</param>
/// <param name="email">Comma-joined unique addresses, "" when none found.</param>
/// <param name="desc">Always "" (not extracted).</param>
void getcompdetail(string id, out string email, out string desc)
{
    string url = "http://search.51job.com/jobsearch/show_job_detail.php?id=(" + id + ")";
    string page = getdetailpage(url);
    string emailpat = @"\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*";
    List<string> found = new List<string>();
    for (Match hit = Regex.Match(page, emailpat, RegexOptions.IgnoreCase);
         hit.Success;
         hit = hit.NextMatch())
    {
        string addr = hit.Value;
        // Keep first occurrence only; drop the site's own addresses.
        if (!found.Contains(addr) && addr.IndexOf("@51job.com") == -1)
            found.Add(addr);
    }
    email = string.Join(",", found.ToArray());
    desc = "";
}
/// <summary>
/// Downloads the raw bytes at <paramref name="url"/> and decodes them as gb2312
/// (decoding bytes directly sidesteps WebClient's default-encoding guessing).
/// </summary>
string getdetailpage(string url)
{
    // WebClient is IDisposable; the original leaked the underlying resources.
    using (WebClient wc = new WebClient())
    {
        byte[] b = wc.DownloadData(url);
        return Encoding.GetEncoding("gb2312").GetString(b);
    }
}
}
1、使用webrequest/webresponse获取URL的数据
2、使用多线程处理
3、ZHAOPIN.COM和51JOB.COM的读取URL如下:
zhaopin.com:http://search.zhaopin.com/jobs/request.asp?page={0}&SchJobType={1}&SearchModel=0
51job.com:http://search.51job.com/jobsearch/search_result.php?fromJs=1&funtype=0000&industrytype={0}&issuedate=9&providesalary=99&keywordtype=2&lang=c&stype=2&workyear=99&cotype=99&degreefrom=99&jobterm=01&fromType=1&curr_page={1}
4、分别对ZHAOPIN.COM和51JOB.COM的数据用正则做出解析,此处有两部分
第一部分是根据(3)的URL获取当前搜索条件的分页数量,获取分页URL,并将分页URL写入到待下载队列中,
第二部分是解析具体的职位信息页面,这些都可以用正则解析出来这两个网站没有做IP访问限制
Uri uri = new Uri(url, UriKind.Absolute);
tmpClient.Headers.Add(HttpRequestHeader.UserAgent, "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)");
Stream stream = tmpClient.OpenRead(uri);
2、使用httpwebrequest/httpwebresponse获取URL地址的内容