http://community.csdn.net/Expert/topic/5243/5243460.xml?temp=.4505731String s = "<img src=\"image/icons/cloudday_small.gif\" width=\"20\" height=\"20\">dfsdf<img src=\"image/icons/cloudday_454.gif\" width=\"20\" height=\"20\">";
Matcher m = Pattern.compile("src=\"?(.*?)(\"|>|\\s+)").matcher(s);
while(m.find())
{
System.out.println(m.group(1));
}
// Scan s for src attributes and print each captured value on its own line.
Pattern srcPattern = Pattern.compile("src=\"?(.*?)(\"|>|\\s+)");
Matcher m = srcPattern.matcher(s);
for (boolean hit = m.find(); hit; hit = m.find()) {
    String captured = m.group(1);
    System.out.println(captured);
}
<img src="http://zi.csdn.net/intel_120x60.gif" />
<img src="http://localhost:80/csdn.net/intel_120x60.gif" /> 这两种形式就不行啦,而且上面只对src进行提取,如果是
<xxx src="xxxx.jpg" /> 这样也会提取出xxxx.jpg
能不能先验证前面是img标签,然后再提取img里面的src地址?
// NOTE(review): the class [A-Za-z0-9_-.] contains the range "_-.", whose start character
// sorts after its end character; java.util.regex is expected to reject that as an illegal
// character range — confirm, and consider moving '-' to the end of the class.
.compile("((http|https|ftp|rtsp|mms):(//|\\\\){1}((([A-Za-z0-9_-])+[.])|([A-Za-z0-9_-.])+[.]){1,}(net|com|cn|org|cc|tv|[0-9]{1,3})(\\S*/)((\\S)+[.]{1}(jpg|jpeg|gif|png){1}))");// Regular expression for extracting image URLs from a web page
上面只能取到
<IMG src="http://192.168.0.155/include/webEditor/uploadfile/20061229014444255.jpg" border=0>
这种格式的路径
如果是加上端口就取不到了
<IMG src="http://192.168.0.155:888/include/webEditor/uploadfile/20061229014444255.jpg" border=0>请高手解答
// Sample page with <img> tags whose src values are single-quoted, double-quoted and
// unquoted, including one tag whose attributes are spread over several lines.
String html = "<html>\r\n" +
"<head><title>test</title><head>\r\n" +
"<body>" +
"<P><IMG height=\"100\" src='abc.png' weight=\"30\">abcdefg" +
"<img src='http://abc.xyz.com/123/456.jpg' /><br>" +
"<IMG height=\"100\" \r\n" +
" src=\"abc.jpg\" \r\n" +
" weight=\"30\">abcdefg \r\n" +
" <img src=ttt.jpg>" +
" <img src=123.jpg />" +
// "<img alt=\"src='abc'\">" + // this form (src text inside another attribute's value) is beyond what this approach can handle
"</body></html>";
// Print the list of src values that getImgSrc extracts from the sample page.
System.out.println(getImgSrc(html));
} public static final Pattern PATTERN = Pattern.compile("<img\\s+(?:[^>]*)src\\s*=\\s*([^>]+)", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
/**
 * Collects the {@code src} attribute value of every tag matched by {@code PATTERN}.
 *
 * <p>{@code PATTERN} captures the text that follows {@code src=} up to the next
 * {@code >}; this method reduces that capture to the attribute value itself:
 * <ul>
 *   <li>a value opened with {@code '} or {@code "} ends at the matching closing quote;</li>
 *   <li>a quoted value with no closing quote, and an unquoted value, end at the first
 *       whitespace character;</li>
 *   <li>a capture that holds no value at all (for example {@code <img src= >}) is
 *       skipped.</li>
 * </ul>
 * Escape sequences inside the value are not decoded.
 *
 * @param html the markup to scan; {@code null} is treated as empty input
 * @return the extracted values in document order, never {@code null}
 */
public static List<String> getImgSrc(String html) {
    List<String> list = new ArrayList<String>();
    if (html == null) {
        return list;
    }
    Matcher matcher = PATTERN.matcher(html);
    while (matcher.find()) {
        // Group 1 always participates in a match, so it is never null. It can however be
        // a lone whitespace character when nothing follows "src=", hence the trim.
        String raw = matcher.group(1).trim();
        if (raw.length() == 0) {
            continue;
        }
        char first = raw.charAt(0);
        if (first == '\'' || first == '"') {
            int close = raw.indexOf(first, 1);
            if (close >= 0) {
                list.add(raw.substring(1, close));
                continue;
            }
            // No closing quote: substring(1, -1) would throw, so drop the opening
            // quote and fall back to the first whitespace-delimited token.
            raw = raw.substring(1).trim();
            if (raw.length() == 0) {
                continue;
            }
        }
        // raw is non-empty and starts with a non-whitespace character here, so the
        // split result always has a non-empty first element.
        list.add(raw.split("\\s")[0]);
    }
    return list;
}
这样写就可以加端口
IMG[\s\S]*?src=\"?((http|https|ftp|rtsp|mms):(//|\\\\){1}((([A-Za-z0-9_-:])+[.])|([A-Za-z0-9_-.:])+[.]){1,}(net|com|cn|org|cc|tv|[0-9]{1,3})(\S*/)((\S)+[.]{1}(jpg|jpeg|gif|png){1}))(\"|>|\\s+)可以取到<IMG src="http://192.168.0.155:8888/include/webEditor/uploadfile/20061229014444255.jpg" border=0>中的20061229014444255.jpg($10)
import java.util.regex.*;// 表达式对象
// Pattern for an IMG tag whose src is an absolute image URL ending in jpg/jpeg/gif/png.
// '-' is written last in each character class so that it is a literal; placed between
// '_' and ':' (or '.') it formed a reversed range, which java.util.regex rejects with a
// PatternSyntaxException. The final alternative is the whitespace class; the previous
// over-escaped form matched a literal backslash followed by the letter s instead.
Pattern p = Pattern.compile("IMG[\\s\\S]*?src=\\\"?((http|https|ftp|rtsp|mms):(//|\\\\\\\\){1}((([A-Za-z0-9_:-])+[.])|([A-Za-z0-9_.:-])+[.]){1,}(net|com|cn|org|cc|tv|[0-9]{1,3})(\\S*/)((\\S)+[.]{1}(jpg|jpeg|gif|png){1}))(\\\"|>|\\s+)");
// Create the Matcher for the text to be searched.
Matcher m = p.matcher("your string");
// Whether a match was found.
boolean found = m.find();
if (found) {
    // The matched text and its start/end offsets within the input.
    String foundstring = m.group();
    int beginPos = m.start();
    int endPos = m.end();
}