没做过这类程序,不知道怎么下手,而且全是一个人做。请做过类似项目的高手提供一些相关资料或程序代码,本人在这里先谢谢了。如果能提供整个方案,本人可以提供经济报酬,但是数目有限,希望大家体谅:本人也是打工的,没赚多少钱。因为项目比较急,又是我一个人做,才出此下策,请高手帮忙。

解决方案 »

  1.   

    也在学习中
    有一个简单的例子
      private void button1_Click(object sender, EventArgs e)
            {
                string url = textBox1.Text;
                string str = DownHtml(url, System.Text.Encoding.GetEncoding("gb2312"));
               
                Regex re = new Regex("(\\d+-)?(\\d{4}-?\\d{7}|\\d{3}-?\\d{8}|^\\d{7,8})(-\\d+)?");
                Match m = re.Match(str);
                int i=0;
                while (m.Success)
                {
                    dataGridView1.Rows.Add();
                    dataGridView1.Rows[i++].Cells[0].Value = m.Value;                m = m.NextMatch();
                }
                MessageBox.Show("下载成功!!");
            }
            /// <summary>
            /// 下载网页
            /// </summary>
            /// <param name="Url">网址</param>
            /// <param name="myEncoding">编码</param>
            /// <returns>网页内容</returns>
            public static string DownHtml(string Url, Encoding myEncoding)
            {
                try
                {
                    HttpWebRequest loHttp = (HttpWebRequest)WebRequest.Create(Url);
                    loHttp.Timeout = 500;                HttpWebResponse loWebResponse = (HttpWebResponse)loHttp.GetResponse();
                    StreamReader loResponseStream = new StreamReader(loWebResponse.GetResponseStream(), myEncoding);
                    string html = loResponseStream.ReadToEnd();
                    loWebResponse.Close();
                    return html;
                }
                catch
                {
                    return null;
                }
            }        private void button2_Click(object sender, EventArgs e)
            {
                string BoardStream = DownHtml(textBox1 .Text , System.Text.Encoding.GetEncoding("gb2312"));
                StreamWriter saveAPage = new StreamWriter(Application .StartupPath +"\\aa.html", false, System.Text.Encoding.GetEncoding("gb2312"));//实例化写入类,保存路径假设为C:\a.html            saveAPage.Write(BoardStream);//创建写入任务            saveAPage.Flush();//写入文件(即清理缓存流)            saveAPage.Close();//关闭写入类的对象
                MessageBox.Show("成功下载!!!");
              //  File.Open(Application.StartupPath + "\\aa.html", FileMode.Open, FileAccess.Read, FileShare.Read);
                System.Diagnostics.Process.Start(Application.StartupPath + "\\aa.html");        }
      

  2.   

    WebClient
    WebRequest
    正则表达式
      

  3.   

    采集也就是从其他人才网站抓取信息吧?我有过类似的代码,是采集CSDN的。
      

  4.   

    http://www.cnblogs.com/hfzsjz/archive/2010/01/26/1656720.html
    http://www.cnblogs.com/hfzsjz/archive/2010/01/26/1656722.html
    http://download.csdn.net/source/1795639
      

  5.   

    http://download.csdn.net/source/1795639
    CSDN的这个资源下载好吗?不知道好不好用,资源分不多了……
      

  6.   

    // A demo I wrote a while ago that fetches job postings from 51job.
    /// <summary>
    /// Runs a fixed ".net" keyword search against the 51job search page, works out
    /// how many result pages there are, and can walk a result page to save each
    /// posting (job name, company, contact e-mails) through <c>Dal.SavePosition</c>.
    /// </summary>
    public class Search51job
    {
        private const string SearchUrl = "http://search.51job.com/jobsearch/search_result.php";
        private const string AdvancedSearchUrl = "http://search.51job.com/jobsearch/advance_search.php";

        // The page count is derived by dividing the reported hit count by this value.
        private const int ResultsPerPage = 30;

        /// <summary>Number of result pages for the search; 0 when the hit count could not be read.</summary>
        public int Total = 0;

        // First result page that was downloaded; hidden form fields are read from it
        // to build the POST body for subsequent pages.
        string Curr_page = null;

        /// <summary>
        /// Creates the searcher and immediately downloads the first result page.
        /// Note that this performs a network request.
        /// </summary>
        public Search51job()
        {
            Get1stPage();
        }

        /// <summary>
        /// Posts the initial search form, caches the returned page (if none is cached
        /// yet) and updates <see cref="Total"/> from the hit count found in the page.
        /// </summary>
        /// <returns>The HTML of the first result page.</returns>
        string Get1stPage()
        {
            string postdata = "keywordtype=2&jobarea=&funtype=0100&industrytype=01&stype=1&searchname=&fromType=1&keyword=.net&issuedate=2&workyear=&providesalary=&cotype=&degreefrom=&jobterm=";
            string page = PostSearch(postdata, AdvancedSearchUrl);
            if (Curr_page == null)
            {
                Curr_page = page;
            }

            string pat = "约有\\s{1,}[^0-9]*([0-9]+)[^0-9]*\\s{1,}项符合条件的查询结果";
            Match m = Regex.Match(page, pat, RegexOptions.IgnoreCase);

            // Match.Result throws on a failed match, which used to crash the
            // constructor whenever the page carried no hit count. Treat that case
            // (and an unparsable number) as "no pages".
            int hits;
            if (m.Success && int.TryParse(m.Groups[1].Value,
                                          System.Globalization.NumberStyles.None,
                                          System.Globalization.CultureInfo.InvariantCulture,
                                          out hits))
            {
                Total = (int)Math.Ceiling(hits / (double)ResultsPerPage);
            }
            else
            {
                Total = 0;
            }
            return page;
        }

        /// <summary>
        /// Downloads result page <paramref name="i"/>. Page 1 is fetched with the
        /// initial search form; later pages re-post the hidden fields of the cached page.
        /// </summary>
        /// <param name="i">Result page number; 1 selects the first page.</param>
        /// <returns>The HTML of the requested result page.</returns>
        string GetPage(int i)
        {
            if (i == 1)
            {
                return Get1stPage();
            }

            // getPostdata reads the hidden fields from the cached page, so make sure
            // one has been downloaded first.
            if (Curr_page == null)
            {
                Get1stPage();
            }

            string page = PostSearch(getPostdata(i), SearchUrl);
            if (Curr_page == null)
            {
                Curr_page = page;
            }
            return page;
        }

        /// <summary>
        /// Sends <paramref name="postdata"/> as a gb2312-encoded form POST to the
        /// search URL, with the fixed cookies this demo uses, and returns the
        /// response body decoded as gb2312.
        /// </summary>
        /// <param name="postdata">URL-encoded form body to send.</param>
        /// <param name="referer">Value for the Referer request header.</param>
        /// <returns>The response HTML.</returns>
        private string PostSearch(string postdata, string referer)
        {
            Uri uri = new Uri(SearchUrl);
            Encoding gb2312 = Encoding.GetEncoding("gb2312");
            byte[] data = gb2312.GetBytes(postdata);

            // Direct cast instead of 'as': a wrong type should fail here with a clear
            // InvalidCastException rather than later with a NullReferenceException.
            HttpWebRequest wrq = (HttpWebRequest)WebRequest.Create(uri);
            wrq.Method = "POST";
            wrq.ContentType = "application/x-www-form-urlencoded";
            wrq.ContentLength = data.Length;
            wrq.CookieContainer = new CookieContainer();
            wrq.CookieContainer.Add(uri, new Cookie("guid", "1225348962627790079"));
            wrq.CookieContainer.Add(uri, new Cookie("51job", "cenglish%3D0"));
            wrq.Referer = referer;

            using (Stream req = wrq.GetRequestStream())
            {
                req.Write(data, 0, data.Length);
            }

            // Dispose the response and reader even when reading fails.
            using (HttpWebResponse wrp = (HttpWebResponse)wrq.GetResponse())
            using (StreamReader sr = new StreamReader(wrp.GetResponseStream(), gb2312))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>
        /// Builds the form body for result page <paramref name="n"/> from the hidden
        /// fields of the cached page, overriding <c>fromType</c> with 14 and
        /// <c>curr_page</c> with <paramref name="n"/>.
        /// </summary>
        /// <param name="n">Result page number to request.</param>
        /// <returns>The form body as <c>name=value</c> pairs joined by <c>&amp;</c>.</returns>
        string getPostdata(int n)
        {
            // "district" appears twice in the original field list; kept as-is.
            string[] names = { "postchannel", "stype", "district", "district",
                               "funtype_big", "funtype", "industrytype", "issuedate",
                               "keywordtype", "dis_keyword", "keyword", "workyear",
                               "providesalary", "cotype", "degreefrom", "jobterm",
                               "ord_field", "list_type", "last_list_type", "curr_page",
                               "last_page", "nStart", "start_page", "total_page", "jobid_list",
                               "jobid_count", "schTime", "statCount", "statData", "fromType" };

            List<string> pairs = new List<string>(names.Length);
            foreach (string name in names)
            {
                string value;
                if (name == "fromType")
                {
                    value = "14";
                }
                else if (name == "curr_page")
                {
                    value = n.ToString();
                }
                else
                {
                    // NOTE(review): values are sent exactly as they appear in the page,
                    // without URL-encoding; confirm the hidden fields never contain
                    // '&' or '='.
                    value = getFieldValue(name);
                }
                pairs.Add(name + "=" + value);
            }
            return string.Join("&", pairs.ToArray());
        }

        /// <summary>
        /// Reads the value of the hidden input called <paramref name="name"/> from the
        /// cached result page.
        /// </summary>
        /// <param name="name">Name attribute of the hidden input.</param>
        /// <returns>The input's value, or an empty string when the page has no such field.</returns>
        string getFieldValue(string name)
        {
            // Escape the name so it is always matched literally inside the pattern.
            string pat = "<input\\s{1,}type=\"hidden\"\\s{1,}name=\"" + Regex.Escape(name) + "\"\\s{1,}value=\"([^\"]*)\">";
            Match match = Regex.Match(Curr_page, pat);

            // Match.Result throws on a failed match; a missing field is sent as empty.
            return match.Success ? match.Groups[1].Value : "";
        }

        /// <summary>
        /// Downloads result page <paramref name="n"/>, pairs each job link with the
        /// company link at the same position, looks up contact e-mails on the job's
        /// detail page and saves every posting.
        /// </summary>
        /// <param name="n">Result page number to process.</param>
        public void GetPosition(int n)
        {
            string page = GetPage(n);
            string jobpat = "<a\\s{1,}href=\"\\/jobsearch\\/show_job_detail\\.php\\?id=\\(([0-9]+)\\)\"\\s{1,}onclick=\"javascript:OJDL\\([0-9]+\\)\"\\s{1,}target=\"_blank\"\\s{1,}class=\"jobname\"\\s{1}>([\\w\\W]+?)<\\/a>";
            string companypat = "<a\\s{1,}href=\"\\/jobsearch\\/co_all_job\\.php\\?coid=\\(\\d{1,}\\)\"\\s{1,}target=\"_blank\"\\s{1,}class=\"coname\"\\s{1,}>([\\w\\W]+?)<\\/a>";

            Match match = Regex.Match(page, jobpat, RegexOptions.IgnoreCase);
            Match match1 = Regex.Match(page, companypat, RegexOptions.IgnoreCase);

            // Job and company matches are consumed in lock-step, so the loop stops as
            // soon as either list runs out.
            while (match.Success && match1.Success)
            {
                string jobid = match.Groups[1].Value;
                string jobname = match.Groups[2].Value;
                string company = match1.Groups[1].Value;

                string email, description;
                getcompdetail(jobid, out email, out description);

                BusiEntry.Position p = new BusiEntry.Position();
                p.jobname = jobname;
                p.email = email;
                p.company = company;
                p.description = description;

                Dal.SavePosition save = new Dal.SavePosition();
                save.SavePos(p);

                match = match.NextMatch();
                match1 = match1.NextMatch();
            }
        }

        /// <summary>
        /// Downloads the detail page of job <paramref name="id"/> and collects the
        /// distinct e-mail addresses found in it, skipping addresses that contain
        /// "@51job.com".
        /// </summary>
        /// <param name="id">Job id as captured from the result page.</param>
        /// <param name="email">Receives the addresses joined by commas; empty when none were found.</param>
        /// <param name="desc">Always receives an empty string; the description is not extracted.</param>
        void getcompdetail(string id, out string email, out string desc)
        {
            string url = "http://search.51job.com/jobsearch/show_job_detail.php?id=(" + id + ")";
            string page = getdetailpage(url);
            string emailpat = @"\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*";
            Regex reg = new Regex(emailpat, RegexOptions.IgnoreCase);

            List<string> list = new List<string>();
            for (Match m = reg.Match(page); m.Success; m = m.NextMatch())
            {
                string str = m.Value;
                // Ordinal comparison: this is a literal domain check, not linguistic text.
                if (!list.Contains(str) && str.IndexOf("@51job.com", StringComparison.Ordinal) == -1)
                {
                    list.Add(str);
                }
            }

            email = string.Join(",", list.ToArray());
            desc = "";
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and decodes the bytes as gb2312.
        /// </summary>
        /// <param name="url">Address of the job detail page.</param>
        /// <returns>The page HTML.</returns>
        string getdetailpage(string url)
        {
            using (WebClient wc = new WebClient())
            {
                byte[] b = wc.DownloadData(url);
                return Encoding.GetEncoding("gb2312").GetString(b);
            }
        }
    }
      

  7.   

    用WebRequest   爬网逻辑很重要
      

  8.   

    我这段时间正在做一个读取ZHAOPIN.COM和51JOB.COM的软件,大致思路给你说一下:
    1、使用webrequest/webresponse获取URL的数据
    2、使用多线程处理
    3、ZHAOPIN.COM和51JOB.COM的读取URL如下:
    zhaopin.com:http://search.zhaopin.com/jobs/request.asp?page={0}&SchJobType={1}&SearchModel=0
    51job.com:http://search.51job.com/jobsearch/search_result.php?fromJs=1&funtype=0000&industrytype={0}&issuedate=9&providesalary=99&keywordtype=2&lang=c&stype=2&workyear=99&cotype=99&degreefrom=99&jobterm=01&fromType=1&curr_page={1}
    4、分别对ZHAOPIN.COM和51JOB.COM的数据用正则做出解析,此处有两部分
    第一部分是根据(3)的URL获取当前搜索条件的分页数量,获取分页URL,并将分页URL写入到待下载队列中,
    第二部分是解析具体的职位信息页面,这些都可以用正则解析出来。这两个网站没有做IP访问限制。
      

  9.   

    其实这个还不是很复杂,你抓一下GOOGLE或阿里巴巴的企业数据试试,一会就会把你的IP屏蔽了
      

  10.   

    // Open a page with WebClient while sending a browser-like User-Agent header.
    // WebClient and the stream returned by OpenRead both hold resources, so each is
    // wrapped in 'using' to release it even if reading throws.
    using (WebClient tmpClient = new WebClient())
    {
        Uri uri = new Uri(url, UriKind.Absolute);
        tmpClient.Headers.Add(HttpRequestHeader.UserAgent, "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)");
        using (Stream stream = tmpClient.OpenRead(uri))
        {
            // Read the response from 'stream' here.
        }
    }
      

  11.   

    1、正则匹配链接
    2、使用httpwebrequest/httpwebresponse获取URL地址的内容