我想编写个程序把http://bbs.fengniao.com/forum/2159500.html中的图片的URL提取出来 using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Net; using System.IO; using System.Text.RegularExpressions; namespace ConsoleApplication1 { class Program { static void Main(string[] args) { StringBuilder strSource = new StringBuilder(""); string webAddress = "http://bbs.fengniao.com/forum/2159500.html";
WebRequest WReq = WebRequest.Create(webAddress);//对URl地址发出请求 WebResponse WResp = WReq.GetResponse();//返回服务器的响应 StreamReader sr = new StreamReader(WResp.GetResponseStream(), Encoding.UTF8);//从数据流中读取数据 string strTemp = ""; while ((strTemp = sr.ReadLine()) != null)//循环读出数据 { strSource.Append(strTemp + "\r\n");//把数据添加到字符串中 string pictureurl; Regex reg= new Regex("<a\\b[^<>]*?\\b\\b[^=]*?\\b(?<imgurl>[^\\s\\t\\r\\n\"\"'<>]*)\\b[^&]*?\\bjpg'\\)\" /></a> "); MatchCollection mc = reg.Matches(strTemp); foreach (Match m in mc) { pictureurl = m.Groups["imgurl"].Value;//如何获得图片地址? Console.WriteLine(pictureurl); Console.WriteLine(strTemp+"\n\n\n"); }
// Walks every match of the pattern in the page source (yourHtml is a placeholder for the
// downloaded HTML). The named group "image_url" holds the src value of the matched <img>;
// the negative lookahead rejects URLs on bbs.fengniao.com itself.
// Fix: the original was missing the parenthesis that closes the foreach header.
foreach (Match m in Regex.Matches(yourHtml, @"(?isn)楼</span>((?!<img).)+?<img.+?src=""(?<image_url>http://(?!bbs\.fengniao\.com)[^""]+)"))
{
    // m.Groups["image_url"].Value is the image URL you are after.
}
至于多的2张图。你这个自己人工去除吧。软件干预过多可能有些有用的也删掉了。不看图。就看图片网址,我看不出有什么规律通用所有帖子,你可以想明白,如果有更好规律总结,留言给你继续按规则写。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
namespace ConsoleApplication1
{
/// <summary>
/// Console tool that downloads one forum thread page and prints the address of every
/// linked .jpg image found in it, one URL per line.
/// </summary>
class Program
{
    /// <summary>
    /// Matches an anchor tag that directly wraps an image tag and captures the value of the
    /// image's src attribute (only when it ends in .jpg) in the named group "imgurl".
    /// Built once and reused, instead of being re-created for every line of the page.
    /// </summary>
    private static readonly Regex LinkedImageRegex = new Regex(
        @"<a\b[^<>]*>\s*<img\b[^<>]*?\bsrc\s*=\s*""(?<imgurl>[^""'<>\s]+?\.jpg)""",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Extracts the image addresses from a piece of HTML.
    /// </summary>
    /// <param name="html">HTML text to scan; may be null or empty.</param>
    /// <returns>
    /// The src values of all linked .jpg images, in document order. Empty when
    /// <paramref name="html"/> is null, empty, or contains no such image.
    /// </returns>
    public static List<string> ExtractImageUrls(string html)
    {
        List<string> urls = new List<string>();
        if (string.IsNullOrEmpty(html))
        {
            return urls;
        }

        foreach (Match m in LinkedImageRegex.Matches(html))
        {
            // The capture group isolates the URL itself; m.Value would be the whole tag prefix.
            urls.Add(m.Groups["imgurl"].Value);
        }

        return urls;
    }

    /// <summary>
    /// Requests the thread page, reads the response line by line and writes every image
    /// address found to standard output.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        string webAddress = "http://bbs.fengniao.com/forum/2159500.html";
        WebRequest request = WebRequest.Create(webAddress); // request the page

        // using blocks release the connection and the reader even if reading throws.
        using (WebResponse response = request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
        {
            string line;
            while ((line = reader.ReadLine()) != null) // scan the page one line at a time
            {
                foreach (string pictureUrl in ExtractImageUrls(line))
                {
                    Console.WriteLine(pictureUrl);
                }
            }
        }
    }
}
}正则表达式<a\\b[^<>]*?\\b\\b[^=]*?\\b(?<imgurl>[^\\s\\t\\r\\n\"\"'<>]*)\\b[^&]*?\\bjpg'\\)\" /></a> 匹配正常 能得到<a href="http://bbs.fengniao.com/forum/pic/slide.php?id=125_2159500_41372532" title="昨天拍到的一处小景! " target="_blank"><img src="http://img3.fengniao.com/forum/attachpics/461/78/18415550.jpg" border="0" alt="昨天拍到的一处小景! " onload="javascript: if((document.body.offsetWidth-this.width)<175) {this.width=800; this.alt='点击浏览原图\r\n'}" ondblclick="javascript:window.open('http://img3.fengniao.com/forum/attachpics/461/78/18415550.jpg')" /></a>
可是 pictureurl = m.Groups["imgurl"].Value; //如何获得图片地址?
这一句没办法得到 http://img3.fengniao.com/forum/attachpics/461/78/18415550.jpg。
我的意思是:我没办法单独提取出其中的图片地址,只能提取到整段 HTML 代码。
{
// m.Groups["image_url"].Value is the value you are looking for
}
至于多出来的2张图,你自己人工去除吧,软件干预过多可能会把有用的也删掉。不看图、只看图片网址,我看不出有什么规律能通用于所有帖子。你可以再想想,如果总结出更好的规律,请留言,我再按规则继续写。
(1)获取http://bbs.fengniao.com/forum/2159500.html源码
(2)对于源码一行一行分析,用我的正则表达式匹配Regex reg= new Regex("<a\\b[^<>]*?\\b\\b[^=]*?\\b(?<imgurl>[^\\s\\t\\r\\n\"\"'<>]*)\\b[^&]*?\\bjpg'\\)\" /></a> ");
(3)能获得<a href="http://bbs.fengniao.com/forum/pic/slide.php?id=125_2159500_41372532" title="昨天拍到的一处小景! " target="_blank"><img src="[color=#FF0000]http://img3.fengniao.com/forum/attachpics/461/78/18415550.jpg" border="0" alt="昨天拍到的一处小景! " onload="javascript: if((document.body.offsetWidth-this.width)<175) {this.width=800; this.alt='点击浏览原图\r\n'}" ondblclick="javascript:window.open('http://img3.fengniao.com/forum/attachpics/461/78/18415550.jpg')" /></a> [/color]
(4)我想得到红色的图片链接
(1)获取http://bbs.fengniao.com/forum/2159500.html源码
(2)对于源码一行一行分析,用我的正则表达式匹配Regex reg= new Regex("<a\\b[^<>]*?\\b\\b[^=]*?\\b(?<imgurl>[^\\s\\t\\r\\n\"\"'<>]*)\\b[^&]*?\\bjpg'\\)\" /></a> ");
(3)能获得<a href="http://bbs.fengniao.com/forum/pic/slide.php?id=125_2159500_41372532" title="昨天拍到的一处小景! " target="_blank"><img src="http://img3.fengniao.com/forum/attachpics/461/78/18415550.jpg" border="0" alt="昨天拍到的一处小景! " onload="javascript: if((document.body.offsetWidth-this.width)<175) {this.width=800; this.alt='点击浏览原图\r\n'}" ondblclick="javascript:window.open('http://img3.fengniao.com/forum/attachpics/461/78/18415550.jpg')" /></a>
(4)我想得到图片链接http://img3.fengniao.com/forum/attachpics/461/78/18415550.jpg
(5)我觉得问题可能出在组的引用上。我的这个正则表达式很不规范,是这几天才学的,不过用 RegexBuddy 测试的时候是没问题的。
Regex reg= new Regex("<a\\b[^<>]*?\\b\\b[^=]*?\\b(?<imgurl>[^\\s\\t\\r\\n\"\"'<>]*)\\b[^&]*?\\bjpg'\\)\" /></a> ");
替换为Regex reg= new Regex("(?<=img src=\").*?jpg");就可以了
请用 C# 测试:我贴给你的是 C# 代码,不是纯正则字符串。如果要用工具测试,请先把连续的 "" 替换为 " 再测试。我给你的表达式是测试过的——虽然手上没有开发工具,但我用自己的工具测试过,结果如下:
http://img3.fengniao.com/forum/attachpics/461/78/18415550.jpg
http://img3.fengniao.com/forum/attachpics/461/78/18415551.jpg
http://img3.fengniao.com/forum/attachpics/461/78/18415552.jpg
http://img3.fengniao.com/forum/attachpics/461/78/18415553.jpg
http://img3.fengniao.com/forum/attachpics/461/78/18415554.jpg
http://img3.fengniao.com/forum/customavatars/avatar386212_5.gif
http://pic.zol-img.com.cn/2011/05/1304989387.jpg
// NOTE(review): excerpt from a larger method. webAddress, strSource, ImageUrl, ImageName,
// Number and textBox3 are not declared in this snippet and must come from the enclosing code.
WebRequest WReq = WebRequest.Create(webAddress); // send the request to the URL
WebResponse WResp = WReq.GetResponse(); // the server's response
StreamReader sr = new StreamReader(WResp.GetResponseStream(), Encoding.UTF8); // read text from the response stream
string strTemp = "";
while ((strTemp = sr.ReadLine()) != null) // read the page line by line
{
    strSource.Append(strTemp + "\r\n"); // accumulate the page source
    string pictureurl = "";
    bool FoundMatch = false; // whether the current line has the expected linked-image shape
    try
    {
        // The whole line must be an anchor wrapping an image whose markup ends in jpg')" /></a>
        FoundMatch = Regex.IsMatch(strTemp, "\\A<a\\b[^<>]*?\\b_blank\"><img\\b[^=]*?\\b=\"(?<imgUrl>[^\\s\\t\\r\\n\"\"'<>]*)\\b[^&]*?\\bjpg'\\)\" /></a> \\z");
    }
    catch (ArgumentException ex)
    {
        // Syntax error in the regular expression
    }
    if (FoundMatch) // pull the image address out of the matching line
    {
        // Splits the line into runs of characters that are not whitespace, quotes or angle brackets.
        Regex RegexObj = new Regex(@"\b(?<imgUrl>[^\s\t\r\n""'<>]*)\b", RegexOptions.Compiled);
        MatchCollection matches = RegexObj.Matches(strTemp);
        foreach (Match match in matches)
        {
            bool Judge = false;
            Judge = Regex.IsMatch(match.Groups["imgUrl"].Value, "\\Ahttp://img3.fengniao.com\\b([^\\s\\t\\r\\n\"\"'<>]*)\\b.jpg\\z");
            if (Judge) // is this token an image address?
            {
                pictureurl = match.Groups["imgUrl"].Value; // image address
            }
        }
        ImageUrl[Number] = pictureurl; // store the image address in the array

        // Fix: this declaration and the foreach header below had been swallowed by the
        // preceding end-of-line comments, which left `match` undefined in the block.
        // NOTE(review): RegexName is never used; the lookup below still runs RegexObj.
        // The index 8 appears tuned to RegexObj's tokens, so it is left as is. Confirm intent.
        Regex RegexName = new Regex(@"\b(?<imgUrl>[\u4E00-\u9FFF]+)\b", RegexOptions.Compiled);
        MatchCollection matchesName = RegexObj.Matches(strTemp);
        int i = 0; // match counter used to pick out the Chinese title
        foreach (Match match in matchesName)
        {
            if (i == 8)
            {
                // image name
                if (match.Groups["imgUrl"].Value == textBox3.Text.Trim())
                    ImageName[Number] = match.Groups["imgUrl"].Value + Number;
                else
                    ImageName[Number] = match.Groups["imgUrl"].Value;
            }
            i++;
        }
        // NOTE(review): the excerpt stops here; the enclosing if and while blocks are
        // still open and are presumably closed by code not shown.
这样就可以了