估计应该是用正则去分析html(不用正则估计会很费劲),功能类似于网站采集软件“火车头”那样的,但是没有那么复杂。我把我的需求说一下吧,大家帮忙看看有没有成熟的框架可用,如果有我就在框架基础上修改算了,从头做感觉太过复杂。1.检查网址是否正常 返回200和60为正常 404为故障 301和302为跳转。 左面列出网站地址,右面对应着相应网站的状态。2.取到一些网站的文章标题以及文章内容
PS:取文章中比较复杂,P表示段落,整合到程序中要处理换行。3.分析百度,比如输入一个关键词,我要得到三部分内容,
1)搜出来的那些普通数据,
2)以及顶端的推广内容
3)右侧的推广内容
PS:取文章中比较复杂,P表示段落,整合到程序中要处理换行。3.分析百度,比如输入一个关键词,我要得到三部分内容,
1)搜出来的那些普通数据,
2)以及顶端的推广内容
3)右侧的推广内容
[email protected]
// HtmlDocument.LoadHtml parses its argument as HTML markup; it does not download anything,
// so passing "www.baidu.com" would only produce a document containing that literal text.
// HtmlWeb.Load fetches the page over HTTP and returns the parsed document.
_html = new HtmlWeb().Load("http://www.baidu.com");
hapTree.BaseNode = _html.DocumentNode;
但是winform下的treeview中并没有BaseNode属性,我应该怎么赋值给tree的哪个属性呢?
我想先看看效果 再继续研究!!
/// <summary>
/// Tree view that shows an <see cref="HtmlNode"/> hierarchy: every HTML node becomes a
/// <c>TreeViewItem</c>, with its attributes grouped under an "Attributes" item and, when
/// attributes exist, its children grouped under an "Elements" item.
/// </summary>
public partial class NodeTreeView : TreeView
{
    private HtmlNode baseNode;

    public NodeTreeView()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Root HTML node shown by the tree. Assigning a value rebuilds the tree;
    /// assigning <c>null</c> simply clears it.
    /// </summary>
    public HtmlNode BaseNode
    {
        get { return baseNode; }
        set
        {
            baseNode = value;
            PopulateTreeview();
        }
    }

    /// <summary>
    /// Creates the tree item for <paramref name="htmlNode"/> and, recursively, for its descendants.
    /// </summary>
    /// <param name="htmlNode">HTML node to represent.</param>
    /// <returns>The populated tree item.</returns>
    private TreeViewItem BuildTree(HtmlNode htmlNode)
    {
        // Preserve a reference to the HTML node on the item for data binding.
        var item = new TreeViewItem { DataContext = htmlNode };

        // Text and comment nodes show their trimmed inner text next to the name.
        if (htmlNode.NodeType == HtmlNodeType.Text || htmlNode.NodeType == HtmlNodeType.Comment)
        {
            item.Header = string.Format("<{0}> = {1}", htmlNode.OriginalName, htmlNode.InnerText.Trim());
        }
        else
        {
            item.Header = string.Format("<{0}>", htmlNode.OriginalName);
        }

        PopulateItem(htmlNode, item);
        return item;
    }

    /// <summary>
    /// Adds the attribute and child-node items of <paramref name="htmlNode"/> to <paramref name="item"/>.
    /// </summary>
    /// <param name="htmlNode">HTML node whose attributes and children are listed.</param>
    /// <param name="item">Tree item (or other items control) that receives the new items.</param>
    private void PopulateItem(HtmlNode htmlNode, ItemsControl item)
    {
        var attributes = new TreeViewItem { Header = "Attributes" };
        foreach (var att in htmlNode.Attributes)
        {
            attributes.Items.Add(new TreeViewItem
            {
                Header = string.Format("{0} = {1}", att.OriginalName, att.Value),
                DataContext = att
            });
        }

        // Only show the "Attributes" group when there is something in it.
        bool hasAttributes = attributes.Items.Count > 0;
        if (hasAttributes)
        {
            item.Items.Add(attributes);
        }

        var elements = new TreeViewItem { Header = "Elements", DataContext = htmlNode };

        // Without attributes there is no need for an intermediate "Elements" group,
        // so children are attached directly to the parent item.
        ItemCollection childTarget = hasAttributes ? elements.Items : item.Items;
        foreach (var node in htmlNode.ChildNodes)
        {
            childTarget.Add(BuildTree(node));
        }

        // Only show the "Elements" group when it actually received children.
        if (elements.Items.Count > 0)
        {
            item.Items.Add(elements);
        }
    }

    /// <summary>
    /// Clears the tree and rebuilds it from the current base node.
    /// Does nothing beyond clearing when no base node is set.
    /// </summary>
    public void PopulateTreeview()
    {
        this.Items.Clear();

        // Guard: BaseNode may legitimately be reset to null; previously this threw NullReferenceException.
        if (baseNode == null)
        {
            return;
        }

        var header = baseNode.NodeType == HtmlNodeType.Document ? "DocumentElement" : baseNode.OriginalName;

        // The root item is added to the tree before its children are created.
        var document = new TreeViewItem { Header = header, DataContext = baseNode, };
        this.Items.Add(document);
        PopulateItem(baseNode, document);
    }
}
/// <summary>
/// Abbreviated variant of the node tree view deriving from the WPF
/// <c>System.Windows.Controls.TreeView</c>. <c>InitializeComponent</c> and
/// <c>PopulateItem</c> are called here but are not declared in this excerpt.
/// </summary>
public class NodeTreeView : System.Windows.Controls.TreeView
{
    private HtmlNode baseNode;

    public NodeTreeView()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Root HTML node shown by the tree. Assigning a value rebuilds the tree;
    /// assigning <c>null</c> simply clears it.
    /// </summary>
    public HtmlNode BaseNode
    {
        get { return baseNode; }
        set
        {
            baseNode = value;
            PopulateTreeview();
        }
    }

    /// <summary>
    /// Clears the tree and rebuilds it from the current base node.
    /// Does nothing beyond clearing when no base node is set.
    /// </summary>
    public void PopulateTreeview()
    {
        this.Items.Clear();

        // Guard: BaseNode may be reset to null; previously this threw NullReferenceException.
        if (baseNode == null)
        {
            return;
        }

        var header = baseNode.NodeType == HtmlNodeType.Document ? "DocumentElement" : baseNode.OriginalName;

        // The root item is added to the tree before its children are created.
        var document = new System.Windows.Controls.TreeViewItem { Header = header, DataContext = baseNode, };
        this.Items.Add(document);
        PopulateItem(baseNode, document);
    }
}是继承PresentationFramework.dll 中的System.Windows.Controls.TreeView
不是winform里那个tree...而且这个类就算写好了 怎么加到工具箱中呢 呃。。 好好的弄什么wpf版本呢 哎
这个类是wpf的 官方示例中倒是能拖出来
太繁杂了 要是winform中的treeview中没有这个属性 暂时就不研究这个了 从简单地方入手吧 越看越闹心这代码
{
/// <summary>
/// Downloads a fixed page, parses it into an <c>HtmlDocument</c> and writes the parsed
/// markup to the response.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    string html = GetWebHtml("http://www.cnblogs.com/pick/", null);

    // GetWebHtml returns null or an empty string when the download fails;
    // LoadHtml must not be called with null.
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    // Writing the document object itself only emits its ToString() text;
    // the markup lives on the document's root node.
    Response.Write(doc.DocumentNode.OuterHtml);
}
/// <summary>
/// Downloads the HTML source of the given URL.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="encoding">Encoding used to decode the body; when null the encoding is detected automatically.</param>
/// <returns>
/// The page text on success; an empty string when the request itself fails;
/// null when the status is not 200 OK or any other error occurs.
/// </returns>
public static string GetWebHtml(string url, Encoding encoding)
{
    try
    {
        HttpWebRequest hwr = (HttpWebRequest)WebRequest.Create(url);
        HttpWebResponse res;
        try
        {
            res = (HttpWebResponse)hwr.GetResponse();
        }
        catch
        {
            // Best effort by design: a failed request yields an empty string rather than an exception.
            return string.Empty;
        }

        // Dispose the response on every path so the underlying connection is released,
        // including when the status is not OK.
        using (res)
        {
            if (res.StatusCode != HttpStatusCode.OK)
            {
                return null;
            }

            using (Stream mystream = res.GetResponseStream())
            {
                // No encoding given: let DecodeData work it out from the headers / body.
                if (encoding == null)
                {
                    return DecodeData(mystream, res);
                }

                // Caller-specified encoding.
                using (StreamReader reader = new StreamReader(mystream, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
        }
    }
    catch
    {
        // Best effort by design: any other failure (bad URL, read error, ...) yields null.
        return null;
    }
}
/// <summary>
/// Reads the whole response body and decodes it to text. The charset is taken from the
/// Content-Type header when present, otherwise from the first "charset=" found in the body;
/// gb2312 is used when no charset is found or the name is not recognised.
/// </summary>
/// <param name="responseStream">Body stream; it is read to the end and closed.</param>
/// <param name="response">Response whose Content-Type header is inspected.</param>
/// <returns>The decoded body text.</returns>
private static string DecodeData(Stream responseStream, HttpWebResponse response)
{
    string name = ExtractCharset(response.Headers["content-type"]);

    // Buffer the whole body so it can be scanned for a charset and then decoded.
    MemoryStream stream = new MemoryStream();
    byte[] buffer = new byte[0x400];
    for (int i = responseStream.Read(buffer, 0, buffer.Length); i > 0; i = responseStream.Read(buffer, 0, buffer.Length))
    {
        stream.Write(buffer, 0, i);
    }
    responseStream.Close();

    if (name == null)
    {
        // No charset in the header: scan the body, read as ASCII, for a charset declaration.
        // This reader is deliberately not disposed, because disposing it would close the
        // MemoryStream that is still needed for the real decode below.
        stream.Seek((long)0, SeekOrigin.Begin);
        string asciiText = new StreamReader(stream, Encoding.ASCII).ReadToEnd();
        name = ExtractCharset(asciiText);
    }

    Encoding encoding = ResolveEncoding(name);

    stream.Seek((long)0, SeekOrigin.Begin);
    using (StreamReader reader = new StreamReader(stream, encoding))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Returns the value that follows the first "charset=" in <paramref name="text"/>,
/// without surrounding quotes or trailing parameters, or null when there is none.
/// </summary>
private static string ExtractCharset(string text)
{
    const string marker = "charset=";

    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    int markerIndex = text.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
    if (markerIndex == -1)
    {
        return null;
    }

    // Skip an opening quote or whitespace, e.g. <meta charset="utf-8"> or charset="utf-8".
    int start = markerIndex + marker.Length;
    while (start < text.Length && (text[start] == '"' || text[start] == '\'' || char.IsWhiteSpace(text[start])))
    {
        start++;
    }

    // The name ends at the closing quote, a parameter separator, the end of the tag or whitespace.
    int end = text.IndexOfAny(new char[] { '"', '\'', ';', '>', '/', ' ', '\t', '\r', '\n' }, start);
    if (end == -1)
    {
        end = text.Length;
    }

    string charset = text.Substring(start, end - start);
    return charset.Length == 0 ? null : charset;
}

/// <summary>
/// Maps a charset name to an <see cref="Encoding"/>, falling back to gb2312 when the
/// name is missing or unknown.
/// </summary>
private static Encoding ResolveEncoding(string name)
{
    if (name == null)
    {
        return Encoding.GetEncoding("gb2312");
    }

    try
    {
        // The original code decodes GBK pages with the GB2312 encoding; kept as is.
        if (string.Equals(name, "GBK", StringComparison.OrdinalIgnoreCase))
        {
            name = "GB2312";
        }
        return Encoding.GetEncoding(name);
    }
    catch
    {
        // Unknown or malformed charset name: use the same default as when none is declared.
        return Encoding.GetEncoding("gb2312");
    }
}
}
看看有没有类似InnerHTML的属性 Response.Write(doc.InnerHTML);
{
// NOTE(review): this field list looks like the internal state of HtmlAgilityPack's
// HtmlDocument parser — the enclosing class header is not visible here; confirm.
// The first field used to sit on the #region line, where it was swallowed by the
// region label, and the region was never closed.
#region Fields

private int _c;
private Crc32 _crc32;
private HtmlAttribute _currentattribute;
private HtmlNode _currentnode;
private Encoding _declaredencoding;
private HtmlNode _documentnode;
private bool _fullcomment;
private int _index;
internal Hashtable _lastnodes = new Hashtable();
private HtmlNode _lastparentnode;
private int _line;
private int _lineposition, _maxlineposition;
internal Hashtable _nodesid;
private ParseState _oldstate;
private bool _onlyDetectEncoding;
internal Hashtable _openednodes;
private List<HtmlParseError> _parseerrors = new List<HtmlParseError>();
private string _remainder;
private int _remainderOffset;
private ParseState _state;
private Encoding _streamencoding;
internal string _text;

#endregion
}
太神奇了 调试的时候自动生成了好多js文件 jquery都有啊 都是从那个网址上采集过来的么?
中途有几处js代码异常 属于正常情况吧 界面上来看的话 和源地址还是蛮像的(只是生成的东西没有源地址漂亮 还有那些字的位置有些变化),大概要的就是这个效果吧 只有_text字段有内容 其它的内容都非常少
显示在winform界面中(richtext中显示 或者 datagridview中显示吧) 我说的这个功能 在这份代码基础上 实现起来合适么?