哪位大虾有C#提取网页正文内容的代码,可不可以发上来我参考参考。谢谢啦!!

解决方案 »

  1.   

    /// <summary>
    /// Overload of GetData that is meant to be called from a worker thread.
    /// Looks up the word currently typed into <c>textBox1</c> on dict.cn, reports progress
    /// through the <c>CurrentState</c> delegate (marshalled with <c>Invoke</c>), and stores
    /// the translation through <c>Dict</c>.
    /// </summary>
    /// <remarks>
    /// States reported to <c>CurrentState</c>: "Doing" when the lookup starts, "Failure" when
    /// the page says the word was not found, "Updated" when the auto-update switch is on and
    /// the entry was updated, "Finish" when the entry was inserted.
    /// Exceptions from the web request or from <c>Dict</c> propagate to the caller.
    /// </remarks>
    public void GetDataOnline()
    {
        DCurrentState oDCurrentState = new DCurrentState(CurrentState); // delegate used for UI updates
        this.Invoke(oDCurrentState, "Doing", null);
        Thread.Sleep(100);

        // NOTE(review): textBox1 and autoUpdate are read directly from this (worker) thread;
        // confirm that cross-thread reads of these controls are acceptable in this form.
        string en = this.textBox1.Text;

        // Escape the word so that characters such as '&', '#', '+' or spaces cannot
        // corrupt the query string.
        WebRequest oRequest = WebRequest.Create("http://dict.cn/search/?q=" + System.Uri.EscapeDataString(en));
        // Optional timeout: oRequest.Timeout = 800;

        // Dispose the response, stream and reader deterministically so the connection
        // is released even when reading fails.
        string oGetData;
        using (WebResponse oResponse = oRequest.GetResponse())
        using (Stream oStream = oResponse.GetResponseStream())
        // NOTE(review): Encoding.Default depends on the machine's ANSI code page;
        // presumably chosen to match the site's encoding - confirm.
        using (StreamReader oReader = new StreamReader(oStream, Encoding.Default))
        {
            oGetData = oReader.ReadToEnd();
        }

        string oRegexPat = @"<big><font\040size=\""2\""\040face=\""Trebuchet\040MS\"">([\w\W]*?)</big>";
        Regex oRegex = new Regex(oRegexPat, RegexOptions.IgnoreCase);
        Match oMatch = oRegex.Match(oGetData);

        if (!oMatch.Success)
        {
            // NOTE(review): nothing is reported here, so the UI keeps the "Doing" state
            // when the page layout does not match; confirm whether "Failure" is wanted.
            return;
        }

        // The first capture group of the first match is the definition text
        // (same value the original obtained from Regex.Split(...)[1]).
        string definition = oMatch.Groups[1].Value;

        if (definition.IndexOf("对不起") >= 0 || definition.IndexOf("单词没找到") >= 0)
        {
            // The site answered that the word does not exist.
            this.Invoke(oDCurrentState, "Failure", " ");
            return;
        }

        // The UI gets line breaks; the stored value keeps the raw HTML fragment.
        string displayText = definition.Replace("<br>", "\r\n");

        if (this.autoUpdate.Checked)
        {
            // Auto-update switch is on: refresh the local entry.
            this.Invoke(oDCurrentState, "Updated", displayText);
            Dict d = new Dict(en);
            d._En = en;
            d._Cn = definition;
            d.Update(d);
        }
        else
        {
            // Save the result as a new local entry. Any exception from Insert propagates
            // unchanged (the former catch + "throw exception;" only destroyed the stack trace).
            this.Invoke(oDCurrentState, "Finish", displayText);
            Dict d = new Dict(en);
            d._En = en;
            d._Cn = definition;
            d.Insert(d);
        }
    }
      

  2.   

    // Build the request; use "GET" to fetch a page or "POST" to submit form data.
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    req.Method = "GET"; // or "POST"
    req.ContentType = "application/x-www-form-urlencoded";

    // Send the request and obtain the response body stream.
    // The caller is responsible for closing both res and ReceiveStream when done.
    HttpWebResponse res = (HttpWebResponse)req.GetResponse();
    Stream ReceiveStream = res.GetResponseStream();
      

  3.   

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and writes it to
    /// <paramref name="filename"/> using the GB2312 encoding. An existing file is first
    /// copied to <c>filename + ".bak"</c> (overwriting any previous backup).
    /// Failures are reported to the current HTTP response when one is available.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="filename">Path of the file to create or overwrite.</param>
    /// <returns>0 when the page was downloaded and written; -1 otherwise.</returns>
    public static int saveHtmlFile(string url,string filename)
    {
        int status = -1;
        string respHTML = string.Empty;
        try
        {
            if (ReadHttp(url, ref respHTML) == "OK")
            {
                if (File.Exists(filename))
                {
                    File.Copy(filename, filename + ".bak", true);
                }

                // using guarantees the writer is flushed and closed exactly once,
                // even when WriteLine throws.
                using (StreamWriter sw = new StreamWriter(filename, false, Encoding.GetEncoding("GB2312")))
                {
                    sw.WriteLine(respHTML);
                }
                status = 0;
            }
            else
            {
                WriteToCurrentResponse("找不到该页或服务器错误");
            }
        }
        catch (Exception err)
        {
            WriteToCurrentResponse(err.Message);
            status = -1;
        }
        return status;
    }

    /// <summary>
    /// Writes a message to the current HTTP response, if there is one.
    /// Outside a web request HttpContext.Current is null, so the message is dropped
    /// instead of raising a NullReferenceException from the error path.
    /// </summary>
    private static void WriteToCurrentResponse(string message)
    {
        System.Web.HttpContext context = System.Web.HttpContext.Current;
        if (context != null)
        {
            context.Response.Write(message);
        }
    }

    /// <summary>
    /// Fetches <paramref name="url"/> and stores the body, decoded as GB2312, in
    /// <paramref name="content"/>.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="content">Receives the response body; left unchanged on failure.</param>
    /// <returns>
    /// The name of the HTTP status code (for example "OK") when the body was read
    /// completely; "ERROR" when the request or the read failed.
    /// </returns>
    public static string ReadHttp(string url,ref string content)
    {
        string status = "ERROR";
        HttpWebRequest Webreq = (HttpWebRequest) WebRequest.Create(url);
        try
        {
            using (HttpWebResponse Webresp = (HttpWebResponse) Webreq.GetResponse())
            using (StreamReader strm = new StreamReader(Webresp.GetResponseStream(), Encoding.GetEncoding("GB2312")))
            {
                content = strm.ReadToEnd();
                // Report the status only after the body has been read, so a failed read
                // can never be returned as "OK" with stale content.
                status = Webresp.StatusCode.ToString();
            }
        }
        catch (Exception)
        {
            // Deliberate best-effort: callers only inspect the returned status string.
            status = "ERROR";
        }
        return status;
    }
      

  4.   

    1.用正则。
    2.string.Substring(),string.IndexOf(),etc.
      

  5.   

    (?#Copyright 2005, by Laser Lu.)(?<Style_Block>(?<begin>\<(?<tag>style)(?:\s+(?<attribute>[\w-:]+)(?:=(?<value>[^\s\>\<]*|\"[\s\S]*?\"|\'[\s\S]*?\'))?)*\s*(?:/)?\>)(?<body>[\s\S]*?)(?<end>\</\k<tag>\>))|(?<Script_Block>(?<begin>\<(?<tag>script)(?:\s+(?<attribute>[\w-:]+)(?:=(?<value>[^\s\>\<]*|\"[\s\S]*?\"|\'[\s\S]*?\'))?)*\s*(?:/)?\>)(?<body>[\s\S]*?)(?<end>\</\k<tag>\>))|(?<Xml_Directive>\<!(?<name>[\w-:]+)(?:\s+(?<argument>[\w-:]+|\"[\s\S]*?\"|\'[\s\S]*?\'))*\s*\>)|(?<Xml_Comment>\<!--[\s\S]*?--\>)|(?<Beginning_Tag>\<(?<tag>[\w-:]+)(?:\s+(?<attribute>[\w-:]+)(?:=(?<value>[^\s\>\<]*|\"[\s\S]*?\"|\'[\s\S]*?\'))?)*\s*(?:/)?\>)|(?<Ending_Tag>\</(?<tag>[\w-:]+)\>)|(?<Xml_CDATA>\<!\[CDATA\[(?<data>[\s\S]*?)\]\]\>)|(?<Xml_Literal>(?:(?<blank>[ ]+)|[^ \<\>])+)