using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.Text.RegularExpressions;
using System.IO;
namespace HtmlPerTJ
{
    /// <summary>
    /// One opening tag found in the parsed HTML text, together with the
    /// character span it covers.
    /// </summary>
    class HtmlParSer
    {
        /// <summary>Tag name wrapped in angle brackets, e.g. "&lt;div&gt;" (source casing is kept).</summary>
        public string TagName;

        /// <summary>Attribute name to attribute value (both strings, quotes removed).</summary>
        public SortedList TagAttribute = new SortedList();

        /// <summary>Index of the '&lt;' that starts the opening tag.</summary>
        public int StarPos;

        /// <summary>
        /// Index of the '&gt;' that ends the opening tag; once a matching closing tag
        /// has been seen, the index of the closing tag's '&gt;'. Equals the text length
        /// when the tag is never terminated.
        /// </summary>
        public int EndPos;

        /// <summary>"0" while the tag is unclosed, otherwise the closing tag text such as "&lt;/div&gt;".</summary>
        public string endTagName = "0";
    }

    /// <summary>
    /// Minimal tag scanner: records every opening tag of an HTML string and
    /// pairs it with its closing tag so that the text of an element can be
    /// extracted.
    /// </summary>
    class HtmpPerSerClass
    {
        /// <summary>The HTML text handed to <see cref="CreateHtmlPerSer"/>.</summary>
        public string Html;

        /// <summary>The same text as a character array.</summary>
        public char[] Htmls;

        // Length of the parsed text; assigned during parsing, not read by this class.
        private int HtmlEndPos = 0;

        /// <summary>All opening tags found, as <see cref="HtmlParSer"/> items in document order.</summary>
        public ArrayList MyList = new ArrayList();

        /// <summary>Number of items in <see cref="MyList"/>.</summary>
        public int Count = 0;

        // Characters that separate the tag name and the attributes inside a tag.
        private static readonly char[] TagWhitespace = { ' ', '\t', '\r', '\n', '\f' };

        // Strips markup, line breaks, leading/trailing whitespace and blanks from extracted content.
        private static readonly Regex ContentCleanup =
            new Regex(@"<[^>]+>|\n[\s| ]*\r|(^\s*)|(\s*$)|\r\n| | ", RegexOptions.IgnoreCase);

        /// <summary>
        /// Parses <paramref name="Str"/> and fills <see cref="MyList"/> with its tags.
        /// Any result of a previous call is discarded so positions never refer to
        /// an older document.
        /// </summary>
        /// <param name="Str">HTML text to scan.</param>
        /// <exception cref="ArgumentNullException"><paramref name="Str"/> is null.</exception>
        public void CreateHtmlPerSer(string Str)
        {
            if (Str == null)
            {
                throw new ArgumentNullException("Str");
            }

            Html = Str;
            Htmls = Str.ToCharArray();
            HtmlEndPos = Htmls.Length;

            // A fresh list (rather than Clear) keeps lists handed out earlier intact.
            MyList = new ArrayList();
            Count = 0;

            for (int i = 0; i < Htmls.Length; i++)
            {
                if (Htmls[i] == '<')
                {
                    returnTag(i);
                }
            }
        }

        /// <summary>
        /// Returns the cleaned-up text of the longest element whose tag name is
        /// listed in <paramref name="tagName"/>.
        /// </summary>
        /// <param name="tagName">Comma separated tag names, with or without angle brackets.</param>
        /// <returns>The element text with markup removed, or an empty string when no tag matches.</returns>
        public string getConten(string tagName)
        {
            if (Html == null)
            {
                return "";
            }

            HtmlParSer widest = null;
            int widestLength = 0;
            foreach (HtmlParSer candidate in getTag(tagName))
            {
                int length = candidate.EndPos - candidate.StarPos;
                if (length > widestLength)
                {
                    widestLength = length;
                    widest = candidate;
                }
            }

            // Nothing matched: do not fall back to the first character of the document.
            if (widest == null || widest.StarPos < 0 || widest.StarPos >= Html.Length)
            {
                return "";
            }

            // EndPos is inclusive; clamp because an unterminated tag ends at the text length.
            int count = Math.Min(widest.EndPos - widest.StarPos + 1, Html.Length - widest.StarPos);
            return ContentCleanup.Replace(Html.Substring(widest.StarPos, count), "");
        }

        /// <summary>
        /// Finds all parsed tags whose name matches one of the requested names,
        /// ignoring case.
        /// </summary>
        /// <param name="TagNames">Comma separated tag names; "div", "&lt;div" and "&lt;div&gt;" are equivalent.</param>
        /// <returns>Matching <see cref="HtmlParSer"/> items, grouped by requested name, each group in document order.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="TagNames"/> is null.</exception>
        public ArrayList getTag(string TagNames)
        {
            if (TagNames == null)
            {
                throw new ArgumentNullException("TagNames");
            }

            ArrayList matches = new ArrayList();
            foreach (string rawName in TagNames.Split(','))
            {
                string wanted = rawName.Trim();
                if (wanted.Length == 0)
                {
                    continue;
                }

                // Normalise to the "<name>" form used by HtmlParSer.TagName.
                if (wanted.IndexOf('<') < 0)
                {
                    wanted = "<" + wanted;
                }
                if (wanted.IndexOf('>') < 0)
                {
                    wanted = wanted + ">";
                }

                foreach (HtmlParSer tag in MyList)
                {
                    // HTML tag names are case-insensitive; stored names keep source casing.
                    if (string.Equals(tag.TagName, wanted, StringComparison.OrdinalIgnoreCase))
                    {
                        matches.Add(tag);
                    }
                }
            }
            return matches;
        }

        // Records an opening tag: splits the tag text (from '<' up to, but excluding, '>')
        // into its name and name=value attributes and appends the result to MyList.
        private void GetAttribute(int starPos, string AttStr)
        {
            HtmlParSer tag = new HtmlParSer();
            tag.StarPos = starPos;
            tag.EndPos = starPos + AttStr.Length;

            string unquoted = AttStr.Replace("\"", "").Replace("'", "");
            foreach (string token in unquoted.Split(TagWhitespace, StringSplitOptions.RemoveEmptyEntries))
            {
                // Split at the first '=' only, so values such as "x?k=v" stay intact.
                int equalsAt = token.IndexOf('=');
                string key = equalsAt < 0 ? token : token.Substring(0, equalsAt);

                if (key.IndexOf('<') >= 0)
                {
                    tag.TagName = key.IndexOf('>') >= 0 ? key : key + ">";
                }
                else if (equalsAt > 0 && !tag.TagAttribute.ContainsKey(key))
                {
                    // First occurrence of a repeated attribute wins.
                    tag.TagAttribute.Add(key, token.Substring(equalsAt + 1));
                }
            }

            MyList.Add(tag);
            Count = MyList.Count;
        }

        // Handles a closing tag: extends the innermost still-open tag of the same name
        // so that its EndPos points at the closing tag's '>'.
        private void setEndPos(int endPos, string EndHtml)
        {
            // Turn "</div " into the "<div>" form used by HtmlParSer.TagName.
            string openForm = string.Concat(
                EndHtml.Replace("/", "").Split(TagWhitespace, StringSplitOptions.RemoveEmptyEntries));
            if (openForm.IndexOf('>') < 0)
            {
                openForm += ">";
            }

            for (int i = MyList.Count - 1; i >= 0; i--)
            {
                HtmlParSer candidate = (HtmlParSer)MyList[i];
                if (candidate.endTagName == "0"
                    && string.Equals(candidate.TagName, openForm, StringComparison.OrdinalIgnoreCase))
                {
                    candidate.EndPos = endPos;
                    candidate.endTagName = openForm.Replace("<", "</");
                    return;
                }
            }
        }

        // Reads the tag that starts at index i (a '<') and dispatches it as an
        // opening or a closing tag.
        private void returnTag(int i)
        {
            // The tag text never includes '>', even when '>' is the last character.
            int closeAt = Array.IndexOf(Htmls, '>', i);
            int end = closeAt < 0 ? Htmls.Length : closeAt;
            string tagText = new string(Htmls, i, end - i);

            if (tagText.IndexOf("</", StringComparison.Ordinal) >= 0)
            {
                setEndPos(i + tagText.Length, tagText);
            }
            else
            {
                GetAttribute(i, tagText);
            }
        }
    }
}
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.Text.RegularExpressions;
using System.IO;
namespace HtmlPerTJ
{
    /// <summary>
    /// One opening tag found in the parsed HTML text, together with the
    /// character span it covers.
    /// </summary>
    class HtmlParSer
    {
        /// <summary>Tag name wrapped in angle brackets, e.g. "&lt;div&gt;" (source casing is kept).</summary>
        public string TagName;

        /// <summary>Attribute name to attribute value (both strings, quotes removed).</summary>
        public SortedList TagAttribute = new SortedList();

        /// <summary>Index of the '&lt;' that starts the opening tag.</summary>
        public int StarPos;

        /// <summary>
        /// Index of the '&gt;' that ends the opening tag; once a matching closing tag
        /// has been seen, the index of the closing tag's '&gt;'. Equals the text length
        /// when the tag is never terminated.
        /// </summary>
        public int EndPos;

        /// <summary>"0" while the tag is unclosed, otherwise the closing tag text such as "&lt;/div&gt;".</summary>
        public string endTagName = "0";
    }

    /// <summary>
    /// Minimal tag scanner: records every opening tag of an HTML string and
    /// pairs it with its closing tag so that the text of an element can be
    /// extracted.
    /// </summary>
    class HtmpPerSerClass
    {
        /// <summary>The HTML text handed to <see cref="CreateHtmlPerSer"/>.</summary>
        public string Html;

        /// <summary>The same text as a character array.</summary>
        public char[] Htmls;

        // Length of the parsed text; assigned during parsing, not read by this class.
        private int HtmlEndPos = 0;

        /// <summary>All opening tags found, as <see cref="HtmlParSer"/> items in document order.</summary>
        public ArrayList MyList = new ArrayList();

        /// <summary>Number of items in <see cref="MyList"/>.</summary>
        public int Count = 0;

        // Characters that separate the tag name and the attributes inside a tag.
        private static readonly char[] TagWhitespace = { ' ', '\t', '\r', '\n', '\f' };

        // Strips markup, line breaks, leading/trailing whitespace and blanks from extracted content.
        private static readonly Regex ContentCleanup =
            new Regex(@"<[^>]+>|\n[\s| ]*\r|(^\s*)|(\s*$)|\r\n| | ", RegexOptions.IgnoreCase);

        /// <summary>
        /// Parses <paramref name="Str"/> and fills <see cref="MyList"/> with its tags.
        /// Any result of a previous call is discarded so positions never refer to
        /// an older document.
        /// </summary>
        /// <param name="Str">HTML text to scan.</param>
        /// <exception cref="ArgumentNullException"><paramref name="Str"/> is null.</exception>
        public void CreateHtmlPerSer(string Str)
        {
            if (Str == null)
            {
                throw new ArgumentNullException("Str");
            }

            Html = Str;
            Htmls = Str.ToCharArray();
            HtmlEndPos = Htmls.Length;

            // A fresh list (rather than Clear) keeps lists handed out earlier intact.
            MyList = new ArrayList();
            Count = 0;

            for (int i = 0; i < Htmls.Length; i++)
            {
                if (Htmls[i] == '<')
                {
                    returnTag(i);
                }
            }
        }

        /// <summary>
        /// Returns the cleaned-up text of the longest element whose tag name is
        /// listed in <paramref name="tagName"/>.
        /// </summary>
        /// <param name="tagName">Comma separated tag names, with or without angle brackets.</param>
        /// <returns>The element text with markup removed, or an empty string when no tag matches.</returns>
        public string getConten(string tagName)
        {
            if (Html == null)
            {
                return "";
            }

            HtmlParSer widest = null;
            int widestLength = 0;
            foreach (HtmlParSer candidate in getTag(tagName))
            {
                int length = candidate.EndPos - candidate.StarPos;
                if (length > widestLength)
                {
                    widestLength = length;
                    widest = candidate;
                }
            }

            // Nothing matched: do not fall back to the first character of the document.
            if (widest == null || widest.StarPos < 0 || widest.StarPos >= Html.Length)
            {
                return "";
            }

            // EndPos is inclusive; clamp because an unterminated tag ends at the text length.
            int count = Math.Min(widest.EndPos - widest.StarPos + 1, Html.Length - widest.StarPos);
            return ContentCleanup.Replace(Html.Substring(widest.StarPos, count), "");
        }

        /// <summary>
        /// Finds all parsed tags whose name matches one of the requested names,
        /// ignoring case.
        /// </summary>
        /// <param name="TagNames">Comma separated tag names; "div", "&lt;div" and "&lt;div&gt;" are equivalent.</param>
        /// <returns>Matching <see cref="HtmlParSer"/> items, grouped by requested name, each group in document order.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="TagNames"/> is null.</exception>
        public ArrayList getTag(string TagNames)
        {
            if (TagNames == null)
            {
                throw new ArgumentNullException("TagNames");
            }

            ArrayList matches = new ArrayList();
            foreach (string rawName in TagNames.Split(','))
            {
                string wanted = rawName.Trim();
                if (wanted.Length == 0)
                {
                    continue;
                }

                // Normalise to the "<name>" form used by HtmlParSer.TagName.
                if (wanted.IndexOf('<') < 0)
                {
                    wanted = "<" + wanted;
                }
                if (wanted.IndexOf('>') < 0)
                {
                    wanted = wanted + ">";
                }

                foreach (HtmlParSer tag in MyList)
                {
                    // HTML tag names are case-insensitive; stored names keep source casing.
                    if (string.Equals(tag.TagName, wanted, StringComparison.OrdinalIgnoreCase))
                    {
                        matches.Add(tag);
                    }
                }
            }
            return matches;
        }

        // Records an opening tag: splits the tag text (from '<' up to, but excluding, '>')
        // into its name and name=value attributes and appends the result to MyList.
        private void GetAttribute(int starPos, string AttStr)
        {
            HtmlParSer tag = new HtmlParSer();
            tag.StarPos = starPos;
            tag.EndPos = starPos + AttStr.Length;

            string unquoted = AttStr.Replace("\"", "").Replace("'", "");
            foreach (string token in unquoted.Split(TagWhitespace, StringSplitOptions.RemoveEmptyEntries))
            {
                // Split at the first '=' only, so values such as "x?k=v" stay intact.
                int equalsAt = token.IndexOf('=');
                string key = equalsAt < 0 ? token : token.Substring(0, equalsAt);

                if (key.IndexOf('<') >= 0)
                {
                    tag.TagName = key.IndexOf('>') >= 0 ? key : key + ">";
                }
                else if (equalsAt > 0 && !tag.TagAttribute.ContainsKey(key))
                {
                    // First occurrence of a repeated attribute wins.
                    tag.TagAttribute.Add(key, token.Substring(equalsAt + 1));
                }
            }

            MyList.Add(tag);
            Count = MyList.Count;
        }

        // Handles a closing tag: extends the innermost still-open tag of the same name
        // so that its EndPos points at the closing tag's '>'.
        private void setEndPos(int endPos, string EndHtml)
        {
            // Turn "</div " into the "<div>" form used by HtmlParSer.TagName.
            string openForm = string.Concat(
                EndHtml.Replace("/", "").Split(TagWhitespace, StringSplitOptions.RemoveEmptyEntries));
            if (openForm.IndexOf('>') < 0)
            {
                openForm += ">";
            }

            for (int i = MyList.Count - 1; i >= 0; i--)
            {
                HtmlParSer candidate = (HtmlParSer)MyList[i];
                if (candidate.endTagName == "0"
                    && string.Equals(candidate.TagName, openForm, StringComparison.OrdinalIgnoreCase))
                {
                    candidate.EndPos = endPos;
                    candidate.endTagName = openForm.Replace("<", "</");
                    return;
                }
            }
        }

        // Reads the tag that starts at index i (a '<') and dispatches it as an
        // opening or a closing tag.
        private void returnTag(int i)
        {
            // The tag text never includes '>', even when '>' is the last character.
            int closeAt = Array.IndexOf(Htmls, '>', i);
            int end = closeAt < 0 ? Htmls.Length : closeAt;
            string tagText = new string(Htmls, i, end - i);

            if (tagText.IndexOf("</", StringComparison.Ordinal) >= 0)
            {
                setEndPos(i + tagText.Length, tagText);
            }
            else
            {
                GetAttribute(i, tagText);
            }
        }
    }
}
[1]使用SgmlReader规范化页面,使符合XML规范
[2]使用XPath语法,任意处置标签
其它说明:
[1]如果页面本身是规范的,可跳过第一步
[2]SgmlReader也不是尽善尽美,格式化出来的XML有时候嵌套关系会搞错,但即使这样,根据它格式化出来的文本,用来提取数据也足够了。当然,也有很多其他的替代方案。
[3]简单规则可以考虑正则,效率高一些。
// Regular-expression fragments for tokenising HTML/XML markup. The smaller
// fragments are concatenated into the larger ones below, so declaration order matters.
// NOTE(review): the enclosing scope and the RegexOptions these are used with are not
// visible here — confirm before relying on case sensitivity.

// A tag name: one or more word characters, '-' or ':'.
string tag = @"(?:[\w-:]+)";
// An attribute name, optionally followed by '=' and an unquoted, double-quoted or single-quoted value.
string attribute = @"(?:[\w-:]+)(?:=(?:[^\s\>\<]*|\""[\s\S]*?\""|\'[\s\S]*?\'))?";
// A directive name; same character set as a tag name.
string name = @"(?:[\w-:]+)";
// A directive argument: a bare word or a double-/single-quoted string.
string argument = @"(?:[\w-:]+|\""[\s\S]*?\""|\'[\s\S]*?\')";
// An opening tag: '<', tag name, any number of attributes, optional '/', then '>'.
string beginningTag = @"(?:\<" + tag + @"(?:\s+" +attribute + @")*\s*(?:/)?\>)";
// A closing tag: "</", tag name, '>'.
string endingTag = @"(?:\</" + tag + @"\>)";
// A comment: "<!--" up to the first "-->".
string xmlComment = @"(?:\<!--[\s\S]*?--\>)";
// A directive: "<!", name, any number of arguments, then '>'.
string xmlDirective = @"(?:\<!" +name + @"(?:\s+" +argument + @")*\s*\>)";
// A CDATA section: "<![CDATA[" up to the first "]]>".
string xmlCData = @"(?:\<!\[CDATA\[(?:[\s\S]*?)\]\]\>)";
// A whole style element, from its opening tag to the first "</Style>".
// NOTE(review): "Style" is spelled with a capital S; it only matches "<style>" if the
// pattern is used with RegexOptions.IgnoreCase — confirm at the call site.
string styleBlock = @"(?:(?:\<(?:Style)(?:\s+" +attribute + @")*\s*(?:/)?\>)(?:[\s\S]*?)(?:\</(?:Style)\>))";
// A whole script element, from its opening tag to the first "</script>".
string scriptBlock = @"(?:(?:\<(?:script)(?:\s+" +attribute + @")*\s*(?:/)?\>)(?:[\s\S]*?)(?:\</(?:script)\>))";
// Literal text between tags: runs of blanks (captured in the "blank" group)
// or any character other than a blank, '<' or '>'.
string xmlLiteral = @"(?:(?:(?<blank>[ ]+)|[^ \<\>])+)";

/// <summary>
/// Extracts the text of the first &lt;title&gt; element of an HTML document.
/// </summary>
/// <param name="strHtml">HTML text; may be null or empty.</param>
/// <returns>
/// The title with surrounding whitespace trimmed, or an empty string when
/// the input is null/empty or contains no title element.
/// </returns>
public static string getTitle(string strHtml)
{
    // Regex.Match throws on null input; treat "no document" as "no title".
    if (string.IsNullOrEmpty(strHtml))
    {
        return "";
    }

    // The static Regex.Match reuses a cached parsed pattern instead of
    // constructing a new Regex object on every call.
    Match mc = Regex.Match(
        strHtml,
        @"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>",
        RegexOptions.Multiline | RegexOptions.IgnoreCase);

    return mc.Success ? mc.Groups["title"].Value.Trim() : "";
}