using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;

namespace ConsoleApplication14
{
    /// <summary>
    /// Console sample that downloads a web page and prints every e-mail address found in it.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Requests http://localhost/test.html, scans the response line by line and writes
        /// each e-mail address matched by the pattern to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The named group "em" wraps the whole address, so it equals the full match.
            string pattern = @"(?<em>\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*)";
            Regex ex = new Regex(pattern);

            HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(new Uri("http://localhost/test.html"));
            wr.Method = WebRequestMethods.Http.Get;

            // using blocks release the connection and the reader even when reading throws;
            // explicit Close() calls at the end would be skipped on an exception.
            using (HttpWebResponse response = (HttpWebResponse)wr.GetResponse())
            using (StreamReader sr = new StreamReader(response.GetResponseStream()))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    foreach (Match match in ex.Matches(line))
                    {
                        Console.WriteLine(match.Groups["em"].Value);
                    }
                }
            }
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;namespace ConsoleApplication14
{
/// <summary>
/// Console sample that downloads a web page and prints every e-mail address found in it.
/// </summary>
class Program
{
    /// <summary>
    /// Requests http://localhost/test.html, scans the response line by line and writes
    /// each e-mail address matched by the pattern to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        // The named group "em" wraps the whole address, so it equals the full match.
        string pattern = @"(?<em>\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*)";
        Regex ex = new Regex(pattern);

        HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(new Uri("http://localhost/test.html"));
        wr.Method = WebRequestMethods.Http.Get;

        // using blocks release the connection and the reader even when reading throws;
        // explicit Close() calls at the end would be skipped on an exception.
        using (HttpWebResponse response = (HttpWebResponse)wr.GetResponse())
        using (StreamReader sr = new StreamReader(response.GetResponseStream()))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                foreach (Match match in ex.Matches(line))
                {
                    Console.WriteLine(match.Groups["em"].Value);
                }
            }
        }
    }
}
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, extracts every e-mail address matched by
/// the regular expression, adds each one to the <c>listemail</c> list box and writes it to
/// HttpPageEmail.txt in the application's startup directory.
/// </summary>
/// <param name="url">Address of the page to download.</param>
private void DownHtmlAndEmail(string url)
{
    // Fetch the page content. The using blocks release the connection, stream and reader
    // even if the download or decoding throws.
    string contentHtml;
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream, Encoding.Default))
    {
        contentHtml = reader.ReadToEnd();
    }

    // Text file that receives every address found on the page.
    string emailFileName = Path.Combine(Application.StartupPath, "HttpPageEmail.txt");

    // Recognise e-mail addresses with a regular expression.
    Regex emailRegex = new Regex(@"([a-zA-Z_0-9.-]+\@[a-zA-Z_0-9.-]+\.\w+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    using (StreamWriter writer = File.CreateText(emailFileName))
    {
        MatchCollection matches = emailRegex.Matches(contentHtml);

        // NOTE(review): shows the raw page text; looks like a debugging aid - confirm it is still wanted.
        MessageBox.Show(contentHtml);

        foreach (Match match in matches)
        {
            this.listemail.Items.Add(match.Value); // show the address in the ListBox
            writer.WriteLine(match.Value);         // and persist it to the file
        }
    }

    // The writer is already flushed and closed here, so the file is complete
    // by the time the user sees the completion message.
    MessageBox.Show("抓取完毕!!!");
}
这是我的源码。我不知道是页面上本来就没有邮箱,还是根本就没抓到:ListBox 里面没有任何记录。
不好意思,在这里我只是提供一个思路。正则表达式是我从 Web Application 的 RegularExpressionValidator 控件的 ValidationExpression 属性中复制(Ctrl+C)出来的,没有去考证它是否完全正确。不过只要思路没错,应该就没问题。楼主可以根据自己的需求修改正则表达式。
有可能是页面上没有邮箱地址。你可以在本机搭一个 IIS,然后在 wwwroot 下新建一个 xxx.html 文件,并往里面写一个邮件地址,例如:<a href="mailto:test@example.com">Send</a>。然后在程序中使用 http://localhost/xxx.html 试试看,看能否抓出 test@example.com。如果抓不出来,就要分析一下你的正则表达式了。