using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.Text.RegularExpressions;
using System.IO;
namespace HtmlPerTJ
{
    class HtmlParSer
    {
        public string TagName;//标签名;
        public SortedList TagAttribute = new SortedList();//属性;
        public int StarPos;//开始字符数;
        public int EndPos;//结束定位数;
        public string endTagName="0";//结束标签
    }
    class HtmpPerSerClass
    {
        public string Html;//文件流对像;
        public char[] Htmls;//文件流对像数组;
        private int HtmlEndPos=0;//文件结尾指针;
        public ArrayList MyList = new ArrayList();//保存HTML标签流
        public int Count = 0;
        //初始化
        public void CreateHtmlPerSer(string Str)
        {
            Html = Str;
            Htmls = Str.ToCharArray();
            HtmlEndPos = Htmls.Length;
            for (int i = 0; i < Htmls.Length; i++)
            {
                if(Htmls[i]=='<')
                returnTag(i);
            }            
        }
        //初始化
        public string getConten(string tagName)
        {
            ArrayList ContentList = new ArrayList();
            ContentList = getTag(tagName);
            int MaxLength = 0;
            int GetStrStar = 0;
            int GetStrEnd = 0;
            /*
            for (int j = 0; j < ContentList.Count; j++)
            {
                for (int c = 0; c < ContentList.Count; c++)
                {
                    if (((HtmlParSer)ContentList[j]).StarPos > ((HtmlParSer)ContentList[c]).StarPos & ((HtmlParSer)ContentList[j]).EndPos <((HtmlParSer)ContentList[c]).EndPos)
                        ContentList.Remove(ContentList[j]);
                }
            }
             */
            for (int i = 0; i < ContentList.Count; i++)
                {
                    int stard = ((HtmlParSer)ContentList[i]).StarPos;
                    int endHtml = ((HtmlParSer)ContentList[i]).EndPos;
                    if (endHtml - stard > MaxLength)
                    {
                        MaxLength = endHtml - stard;
                        GetStrStar = stard;
                        GetStrEnd = endHtml;
                    }
                }
            return Regex.Replace(Html.Substring(GetStrStar, GetStrEnd - GetStrStar + 1), @"<[^>]+>|\n[\s| ]*\r|(^\s*)|(\s*$)|\r\n| | ", "", RegexOptions.IgnoreCase);
        }
        //获取TAG   
        public ArrayList getTag(string TagNames)
        {
            string[] TagName = TagNames.Split(','); 
            ArrayList TempList = new ArrayList();
            for (int TagNameCount = 0; TagNameCount < TagName.Length; TagNameCount++)
            {
                if (!Regex.IsMatch(TagName[TagNameCount], "<", RegexOptions.IgnoreCase))
                    TagName[TagNameCount] = "<" + TagName[TagNameCount];
                if (!Regex.IsMatch(TagName[TagNameCount], ">", RegexOptions.IgnoreCase))
                    TagName[TagNameCount] = TagName[TagNameCount] + ">";
                TagName[TagNameCount] = TagName[TagNameCount].ToLower();
               
                for (int i = 0; i < MyList.Count; i++)
                {
                    if (((HtmlParSer)MyList[i]).TagName == TagName[TagNameCount])
                    {
                        TempList.Add(MyList[i]);
                    }
                }
            }
            return TempList;
        }
        //获取TAG
        //分析属性
        private void GetAttribute(int starPos,string AttStr)
        {
            HtmlParSer HtmlParTemp = new HtmlParSer();
            HtmlParTemp.EndPos = starPos+AttStr.Length;
            AttStr = Regex.Replace(AttStr,"\"|'","",RegexOptions.IgnoreCase);
            if(!Regex.IsMatch(AttStr," >",RegexOptions.IgnoreCase))
            AttStr = Regex.Replace(AttStr, ">", " >", RegexOptions.IgnoreCase);
            AttStr = Regex.Replace(AttStr, " ", "|", RegexOptions.IgnoreCase);
            AttStr = AttStr.Replace("|>",">");
                string[] AttValueTemp=AttStr.Split('|');
                for (int i = 0; i < AttValueTemp.Length; i++)
                {
                    string[] ValueTemp = AttValueTemp[i].Split('=');
                    if (!(ValueTemp.Length != 2))
                    {
                        try
                        {
                            if(ValueTemp[0]!="")
                            HtmlParTemp.TagAttribute.Add(ValueTemp[0].ToString(), ValueTemp[1].ToString().Replace(">",""));
                        }
                        catch (Exception e)
                        {
                        }
                    }
                    else
                    {
                        if (Regex.IsMatch(ValueTemp[0],"<",RegexOptions.IgnoreCase))
                            HtmlParTemp.TagName = ValueTemp[0];
                        if (!Regex.IsMatch(HtmlParTemp.TagName, ">", RegexOptions.IgnoreCase))
                            HtmlParTemp.TagName += ">";
                    }
                }
            HtmlParTemp.StarPos = starPos;
            MyList.Add(HtmlParTemp);
            Count = MyList.Count;
        }
        //分析属性
        //获取块结尾
        private void setEndPos(int endPos,string EndHtml)
        {
            EndHtml = EndHtml.Replace("/", "");
            if(!Regex.IsMatch(EndHtml,">",RegexOptions.IgnoreCase))
            EndHtml = EndHtml.Replace(" ", "")+">";
            int PostArrayI=-1;
            for (int i = 0; i < MyList.Count; i++)
            {
                if (((HtmlParSer)MyList[i]).TagName == EndHtml & ((HtmlParSer)MyList[i]).endTagName=="0")
                {
                    PostArrayI = i;
                }
            }
            if (PostArrayI >= 0)
            {
                ((HtmlParSer)MyList[PostArrayI]).EndPos = endPos;
                ((HtmlParSer)MyList[PostArrayI]).endTagName = EndHtml.Replace("<","</");
            }
        }
        //获取块结尾
        //获得标签名
        private void returnTag(int i)
        {
            string TagHtml = "";
            bool get = true;
            for (int j = i; j < Htmls.Length; j++)
            {
                if (j + 1 < Htmls.Length)
                {
                    if (Htmls[j] == '/')
                    {
                        if (Htmls[j - 1] == '<')
                            get = false;
                    }
                    else
                    {
                        if (Htmls[j] == '>')
                        {
                            break;
                        }
                    }
                }
                TagHtml += Htmls[j];
            }
            if (get)
                GetAttribute(i, TagHtml);
            else
            {
                setEndPos(i + TagHtml.Length, TagHtml);
            }
        }
        //获取标签名
    }
}

解决方案 »

  1.   

    看着就晕,也没有描述一下大体想干什么。一般对于网页的解析可以两步轻松完成。
    [1]使用SgmlReader规范化页面,使符合XML规范
    [2]使用XPath语法,任意处置标签。
    其它说明:
    [1]如果页面本身是规范的,可跳过第一步。
    [2]SgmlReader也不是尽善尽美,格式化出来的XML有时候嵌套关系会搞错,但即使这样,根据它格式化出来的文本,用来提取数据也足够了。当然也有很多的替代方案。
    [3]简单规则可以考虑正则,效率高一些。
      

  2.   

    通过正则表达式、XPath、WebBrowser控件都可以解析HTML,例如:
    // Regex fragments for tokenizing HTML/XML text; the later fragments are
    // composed from the earlier ones by string concatenation.
    string tag = @"(?:[\w-:]+)";
    string attribute = @"(?:[\w-:]+)(?:=(?:[^\s\>\<]*|\""[\s\S]*?\""|\'[\s\S]*?\'))?";
    string name = @"(?:[\w-:]+)";
    string argument = @"(?:[\w-:]+|\""[\s\S]*?\""|\'[\s\S]*?\')";
    // Opening tag (optionally self-closing) and closing tag.
    string beginningTag = @"(?:\<" + tag + @"(?:\s+" + attribute + @")*\s*(?:/)?\>)";
    string endingTag = @"(?:\</" + tag + @"\>)";
    // Comment, directive and CDATA section.
    string xmlComment = @"(?:\<!--[\s\S]*?--\>)";
    string xmlDirective = @"(?:\<!" + name + @"(?:\s+" + argument + @")*\s*\>)";
    string xmlCData = @"(?:\<!\[CDATA\[(?:[\s\S]*?)\]\]\>)";
    // Whole style / script elements including their content.
    string styleBlock = @"(?:(?:\<(?:Style)(?:\s+" + attribute + @")*\s*(?:/)?\>)(?:[\s\S]*?)(?:\</(?:Style)\>))";
    string scriptBlock = @"(?:(?:\<(?:script)(?:\s+" + attribute + @")*\s*(?:/)?\>)(?:[\s\S]*?)(?:\</(?:script)\>))";
    // Text outside of tags; runs of spaces are captured in the "blank" group.
    string xmlLiteral = @"(?:(?:(?<blank>[ ]+)|[^ \<\>])+)";

    /// <summary>
    /// Extracts the text of the first title element of an HTML document.
    /// </summary>
    /// <param name="strHtml">The HTML text; may be null or empty.</param>
    /// <returns>The trimmed title text, or an empty string when there is no title element.</returns>
    public static string getTitle(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return "";
        }

        // [\s\S] matches any character including line breaks, so a title that
        // spans several lines is still found; \b keeps longer tag names that
        // merely start with "title" from matching.
        Match match = Regex.Match(strHtml, @"<title\b[^>]*>(?<title>[\s\S]*?)</title[^>]*>", RegexOptions.IgnoreCase);
        return match.Success ? match.Groups["title"].Value.Trim() : "";
    }
      

  3.   

    把网络上任意页面的HTML代码分成几个块,然后计算各块的权重,以便提取主要内容。