<div class="moreimages">
<table width="750" border="0" cellspacing="0" cellpadding="0">
<tbody>
<tr>
<td align="center">
<!-- Full-width banner: a single 750x49 image in a one-cell layout grid.
     alt is left empty to match the other images in this snippet; supply real
     text if the banner image carries a heading. -->
<table width="750" border="0"
cellspacing="0" cellpadding="0">
<tbody>
<tr>
<td><img
src="http://images.vjia.com/ShopPic/20121008/20121008151836_8158.jpg"
width="750" height="49" alt="" /></td>
</tr>
</tbody>
</table>
<!-- Sliced-image layout grid, 700px wide, three rows:
     row 1: one 700x141 image spanning all three columns;
     row 2: a 377x89 image spanning two rows, plus a linked 323x31 image
            (opens mall.vjia.com in a new window);
     row 3: a 73x58 image and a nested 250x58 grid whose cell has a background
            image and the text "M".
     NOTE(review): the linked image has an empty alt, so the link has no text
     alternative. Whitespace between the tags is left untouched because sliced
     layouts are sensitive to it. -->
<table border="0" cellpadding="0" cellspacing="0" width="700">
<tbody>
<tr>
<td colspan="3"><img name="MOTO_r1_c1"
src="http://images.vjia.com/Others/2012/12/20/20121220145602_0266.jpg"
width="700" height="141" border="0" id="MOTO_r1_c1" alt="" />
</td>
</tr>
<tr>
<td rowspan="2"><img name="MOTO_r2_c1"
src="http://images.vjia.com/Others/2012/12/20/20121220145617_5644.jpg"
width="377" height="89" border="0" id="MOTO_r2_c1" alt="" /></td>
<td colspan="2"><a
href="http://mall.vjia.com/magaseek/Custom/content_2911"
target="_blank"> <img name="MOTO_r2_c2"
src="http://images.vjia.com/Others/2012/12/20/20121220145635_4267.jpg"
width="323" height="31" border="0" id="MOTO_r2_c2" alt="" /></a>
</td>
</tr>
<tr>
<td><img name="MOTO_r3_c2"
src="http://images.vjia.com/Others/2012/12/20/20121220145646_5340.jpg"
width="73" height="58" border="0" id="MOTO_r3_c2" alt="" /></td>
<td><table width="250" height="58" border="0"
cellpadding="0" cellspacing="0">
<tbody>
<tr>
<td
background="http://images.vjia.com/ShopPic/20121015/20121015141810_5606.jpg"><span>M</span>
</td>
</tr>
</tbody>
</table></td>
</tr>
</tbody>
</table>
<!-- Full-width banner: a single 750x49 image in a one-cell layout grid.
     NOTE(review): the image references an image map named "Map", but no map
     element is present in this snippet; confirm it exists in the full page.
     alt is left empty to match the other images in this snippet. -->
<table width="750" border="0" cellspacing="0" cellpadding="0">
<tbody>
<tr>
<td><img
src="http://images.vjia.com/ShopPic/20121009/20121009101912_1819.jpg"
width="750" height="49" border="0" usemap="#Map" alt="" /></td>
</tr>
</tbody>
</table>
<!-- One-cell layout grid, 700px wide, holding eight product images (all with
     empty alt). The first three are separated by line breaks inside one em
     element; the remaining five are grouped into four paragraphs. -->
<table width="700" border="0" cellspacing="0" cellpadding="0">
<tbody>
<tr>
<td><em> <img
src="http://images.vjia.com/Others/2013/3/6/__aW1nMDI=__20130306154258_5958.jpg"
alt="" /><br />
<img
src="http://images.vjia.com/Others/2013/3/6/__aW1nMDI=__20130306154311_3878.jpg"
alt="" /><br />
<img
src="http://images.vjia.com/Others/2013/3/6/__aW1nMDI=__20130306154318_9070.jpg"
alt="" /><br /></em>
<p>
<em><img
src="http://images.vjia.com/Others/2013/3/6/__aW1nMDI=__20130306154325_4746.jpg"
alt="" /> <img
src="http://images.vjia.com/Others/2013/3/6/__aW1nMDI=__20130306154332_7130.jpg"
alt="" /></em>
</p>
<p>
<em><img
src="http://images.vjia.com/Others/2013/3/6/__aW1nMDI=__20130306154341_1994.jpg"
alt="" /><br /></em>
</p>
<p>
<em><img
src="http://images.vjia.com/Others/2013/3/6/__aW1nMDI=__20130306154354_0226.jpg"
alt="" /><br /></em>
</p>
<p>
<em><img
src="http://images.vjia.com/Others/2013/3/6/__aW1nMDI=__20130306154409_2950.jpg"
alt="" /><br /></em>
</p></td>
</tr>
</tbody>
</table>
<table width="750" border="0" cellspacing="0" cellpadding="0">
<tbody>
<tr>
<!-- This image is unique on the page, so it can be used to locate this
     position. (The note originally sat inside the img tag, which is not valid
     markup and breaks XML parsing; it now precedes the cell.) -->
<td><img
src="http://images.vjia.com/ShopPic/20121009/20121009104747_8492.jpg"
width="750" height="49" alt="" /></td>
</tr>
</tbody>
</table>
<table>
<tbody>
<tr>
<td align="center">
<table width="700" border="0"
cellspacing="0" cellpadding="0">
<tbody>
<tr>
<!-- This is the part to extract (main 350x350 image). The note originally sat
     inside the img tag, which is not valid markup. -->
<td width="350" height="350"><img
src="http://images.vjia.com/Others/2013/3/28/__aW1nMDI=__20130328184454_8831.jpg"
width="350" height="350" alt="" /></td>
<!-- The span below was never closed in the original; closed here so the
     document stays well-formed. -->
<td><span name="" value=""></span></td>
<td width="150" align="center"><table width="135"
height="350" border="0" cellpadding="0" cellspacing="0">
<tbody>
<tr>
<!-- This is the part to extract (upper-right 135x135 image). -->
<td height="137" valign="top"><a href="#"
target="_blank"></a><a
href="http://item.vjia.com/3493682.html"
target="_blank"><img
src="http://images.vjia.com/Others/2013/3/28/__aW1nMDI=__20130328184538_0793.jpg"
width="135" height="135" alt="" /></a></td>
</tr>
<tr>
<!-- This is the part to extract (66x66 image between the two product
     thumbnails). alt is left empty to match the neighbouring images. -->
<td align="center"><img
src="http://images.vjia.com/ShopPic/20121009/20121009105859_8061.jpg"
width="66" height="66" alt="" /></td>
</tr>
<tr>
<td height="137" valign="bottom"><a href="#"
target="_blank"></a><a
href="http://item.vjia.com/3510217.html"
target="_blank"><img
src="http://images.vjia.com/Others/2013/3/28/__aW1nMDI=__20130328184633_7873.jpg"
width="135" height="135" alt="" /></a></td>
</tr>
</tbody>
</table></td>
<td><table width="194" height="350" border="0"
cellpadding="0" cellspacing="0">
<tbody>
<tr>
<td height="175"><table width="180" height="112"
border="0" align="center" cellpadding="0"
cellspacing="0">
<tbody>
<tr>
<td height="112" valign="top"><div align="left">
百褶裙11S20342<br />
<s>市 场 价:¥900.00 </s><br />特 惠 价:¥270.00 免运费
</div></td>
</tr>
</tbody>
</table></td>
</tr>
<tr>
<td><table width="180" height="106" border="0"
align="center" cellpadding="0" cellspacing="0">
<tbody>
<tr>
<td height="97" valign="bottom"><div
align="left">
休闲鞋121-9608-0<br />
<s>市 场 价:¥1158.00</s><br />特 惠 价:¥579.00 免运费
</div></td>
</tr>
</tbody>
</table></td>
</tr>
</tbody>
</table></td>
</tr>
</tbody>
</table></td>
</tr>
</tbody>
</table>
<!-- Full-width banner: a single 750x49 image in a one-cell layout grid.
     alt is left empty to match the other images in this snippet; supply real
     text if the banner image carries a heading. -->
<table width="750" border="0" cellspacing="0" cellpadding="0">
<tbody>
<tr>
<td><img
src="http://images.vjia.com/ShopPic/20121009/20121009112603_2004.jpg"
width="750" height="49" alt="" /></td>
</tr>
</tbody>
</table>
<!-- Centered one-cell layout grid holding a single 700x417 image.
     alt is left empty to match the other images in this snippet; supply real
     text if the image conveys information. The last line also closes the
     enclosing outer cell. -->
<table width="700" border="0" align="center" cellpadding="0"
cellspacing="0">
<tbody>
<tr>
<td><img
src="http://images.vjia.com/ShopPic/20121009/20121009135304_9195.jpg"
width="700" height="417" alt="" /></td>
</tr>
</tbody>
</table></td>
</tr>
</tbody>
</table>
</div>
HTML页面代码如上,这也是我现在的一项工作:把商品页的图片定向地抓取下来。
我现在的思路是根据唯一确定的img图片来判断table的位置,截取包含该img图片的那个table后面的那个table,这就是所需要的table;然后在这个table里面用正则截取图片地址。
代码如下:
String img="http://images.vjia.com/ShopPic/20121008/20121008151836_8158.jpg";// marker image used to locate the position
String html="上面的那段htmkl";
/**
 * Extracts a fragment of the table that follows the table containing the marker image.
 *
 * The pattern looks for: a table containing {@code img}, its first closing table tag,
 * then a bare {@code <table>} tag, then the next {@code <table} inside it. The returned
 * text starts right after that {@code <table} and stops before the first closing
 * table tag that follows, so a nested table cuts the result short.
 * When the pattern matches several times, the last match wins.
 *
 * @param img  URL of the marker image; matched literally
 * @param html page source to search
 * @return the captured fragment, or {@code null} when either argument is null,
 *         nothing matches, or the fragment is empty
 */
public String getTable(String img,String html){
    if(img==null||html==null){
        return null;
    }
    // Pattern.quote: the URL contains '.', which would otherwise match any character.
    // No leading ".*?" is needed: find() already scans forward to the first position that matches.
    Pattern p=Pattern.compile("(?s)<table.*?"+Pattern.quote(img)+".*?</table>.*?<table>.*?<table(.*?)</table>");
    Matcher m=p.matcher(html);
    String str = null;
    while(m.find()){
        str=m.group(1);
    }
    // Test for null first; the original called str.equals("") on a null
    // reference whenever the pattern did not match.
    if(str==null||str.equals("")){
        return null;
    }
    return str;
}
这样返回的是一段table代码,在table里面缩小了范围找就好找图片了。。
但是现在遇到一个问题:我代码中的那个正则效率很低,因为匹配范围是整个html,而且用了多个.*?这样的标记,匹配一次table竟然需要足足3秒,这在大数据量的前提下是不被允许的。
求改进正则,或者逻辑上能实现也行。
提高一下效率。。
/**
 * Returns the content of the first tbody element that starts after the marker image.
 *
 * The result runs from just after the first {@code <tbody>} following {@code img}
 * up to the first {@code </tbody>} after it, so a nested tbody cuts the result short.
 *
 * @param img  marker text (image URL) to search for; matched literally
 * @param html page source to search
 * @return the tbody content, or {@code null} when either argument is null, the marker
 *         is absent, or no complete tbody follows it
 */
public String getTable(String img,String html){
    if(img==null||html==null){
        return null;
    }
    int markerPos=html.indexOf(img);
    if(markerPos<0){
        // The original passed -1 to substring(), which throws StringIndexOutOfBoundsException.
        return null;
    }
    // find(int) starts scanning at the marker, so no substring copy and no leading ".*?" are needed.
    Matcher m=Pattern.compile("(?s)<tbody>(.*?)</tbody>").matcher(html);
    return m.find(markerPos)?m.group(1):null;
}
/**
 * Collects the first three image URLs containing "Others" from the given markup.
 *
 * Every {@code src="..."} value is inspected in document order; values without
 * the substring "Others" are skipped. The first hit becomes the main image, the
 * second the right-up image and the third the right-down image.
 *
 * @param html markup to scan
 * @param mjid identifier stored on the returned entity via setMjid
 * @return a new entity carrying up to three image URLs and the given mjid
 */
public ImgVjiaentity getImgUrl(String html,String mjid) {
    ImgVjiaentity entity = new ImgVjiaentity();
    int slot = 1;
    Matcher m2 = Pattern.compile("(?s)src=\"(.*?)\"").matcher(html);
    // Stop once all three slots are filled. The original never advanced the
    // counter past 3, so every later "Others" image overwrote the right-down URL.
    while (slot <= 3 && m2.find()) {
        String imgsrc = m2.group(1);
        if (!imgsrc.contains("Others")) {
            continue;
        }
        if (slot == 1) {
            entity.setMainImgUrl(imgsrc);
        } else if (slot == 2) {
            entity.setRightupImgUrl(imgsrc);
        } else {
            entity.setRightdownImgUrl(imgsrc);
        }
        slot++;
    }
    entity.setMjid(mjid);
    return entity;
}
.*?出现得多了,效率也就低了。比如<td><img src出现得比较多,可以直接排除不是以<td><img开头的内容,这样效率将远远超过(?s)src
而不是直接的<td><img src
就是怎么截取这个table出现了效率低的问题
// Locate the table that follows the marker image using plain indexOf calls
// (no regex), print it, then print the elapsed milliseconds.
// NOTE(review): the cut ends at the first closing table tag after the opening
// one, so a table containing nested tables is truncated.
String img = "http://images.vjia.com/ShopPic/20121008/20121008151836_8158.jpg";
String tableStart = "<table";
String tableEnd = "</table>";
long oldT = System.currentTimeMillis();
try{
    int imgPos = s.indexOf(img);
    if(imgPos < 0){
        // Without this check indexOf(tableStart,-1) searches from 0 and silently
        // picks the first table of the document.
        throw new IllegalStateException("marker image not found: " + img);
    }
    int pos = s.indexOf(tableStart,imgPos);
    int end = pos < 0 ? -1 : s.indexOf(tableEnd,pos);
    if(end < 0){
        throw new IllegalStateException("no complete table found after marker image");
    }
    // substring's end index is exclusive; the original subtracted 1 here and
    // dropped the final '>' of the closing tag.
    String table = s.substring(pos,end+tableEnd.length());
    System.out.println(table);
    System.out.println(System.currentTimeMillis() - oldT);
}catch(Exception e){
    e.printStackTrace();
}
}
String html="上面的那段htmkl";
换个思路考虑问题如何?
不要把html看成字符串,建立一个xml文档,采用流或者基于事件的方式解析xml文档
循环子table,
(指定src属性的img所在的table 有其他规则的话,提前判断,减少不必要的img搜索)
如果子table下面的img元素 的属性src= 指定值,找到下面的table(目标table)如果子table下的img元素没有符合要求,继续寻找下一个。
如果有代码最好了,dom解析没怎么用过,所以没使用这种方式。
按照这种思路下去这个问题这种方式应该可以解决。
// Build a parser for the page URL assembled from id and i (both defined outside this snippet).
// NOTE(review): Parser/NodeFilter/NodeList look like the HTMLParser library API — confirm the dependency.
Parser parser = new Parser("http://tieba.baidu.com/p/"+id+"?pn="+i);
// Keep only nodes that are img tags AND have the attribute class="BDE_Image".
NodeFilter filter = new AndFilter(new TagNameFilter("img"),new HasAttributeFilter("class","BDE_Image")) ;
NodeList nodes = parser.extractAllNodesThatMatch(filter);
// imgNum is declared outside this snippet; it receives the number of matched nodes.
imgNum = nodes.size();
if (imgNum > 0) {
// Write the HTML of every matched node to pwriter (declared outside this snippet).
for (NodeIterator ni = nodes.elements(); ni.hasMoreNodes();) {
pwriter.println(ni.nextNode().toHtml());
}
}
* 搜索候选table-随着规则的变化需要修改此类
* @param srcValue
* @param root
* @return
*/
@SuppressWarnings("unchecked")
public Element searchTable(String srcValue,Element root) {
Element targetTable = null;
List<Element> candidacyTables = root.selectNodes("table/tbody/tr/td/table");
int length = candidacyTables.size();
for (int i = 0; i < length; i++) {
Element tableElement = candidacyTables.get(i);
List<Element> imgElements = tableElement.selectNodes("tbody/tr/td/img");
if (imgElements != null && imgElements.size() == 1) {
String src = imgElements.get(0).attribute("src").getValue();
if (srcValue.equals(src)) {
targetTable = candidacyTables.get(i + 1);
break;
}
}
}
return targetTable;
}
/**
 * Collects the src values of the images inside a table.
 *
 * Images are selected with the path {@code tbody//img[@src]} relative to the table.
 *
 * @param table table element to inspect; must not be null
 * @return the src values in selection order; empty when no image is selected
 * @throws IllegalArgumentException when {@code table} is null
 */
@SuppressWarnings("unchecked")
public List<String> handleTableToObtainImgSrc(Element table) {
    if (table == null) {
        throw new IllegalArgumentException("parameter table should not be null!");
    }
    List<String> sources = new ArrayList<String>();
    List<Element> images = table.selectNodes("tbody//img[@src]");
    if (images == null) {
        return sources;
    }
    for (int idx = 0; idx < images.size(); idx++) {
        sources.add(images.get(idx).attribute("src").getValue());
    }
    return sources;
}
public static void main(String[] args) throws DocumentException {
long start = System.currentTimeMillis();
Document doc = Dom4jHelper.readFile("src/main/resources/book.xml");
Element root = Dom4jHelper.getRootElement(doc);
Element table = new SearchTable().searchTable("http://images.vjia.com/ShopPic/20121009/20121009104747_8492.jpg", root);
List<String> lstSrc = new SearchTable().handleTableToObtainImgSrc(table);
long end = System.currentTimeMillis();
System.out.println("cost time :" + (end -start));
for (String src : lstSrc) {
System.out.println(src);
}
}帮助类:public class Dom4jHelper {
/**
* 读取文档
* @param path 文件路径
* @return 文档对象
* @throws DocumentException
*/
public static Document readFile(String path) throws DocumentException {
SAXReader reader = new SAXReader();
return reader.read(new File(path));
}
/**
* 获取root元素
* @param doc
* @return
*/
public static Element getRootElement(Document doc) {
return doc.getRootElement();
}
}