我想统计下面HMTL文件中<boyd>下第一个<table>里包含的汉字数
package edu.swjut.paper;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
public class CountLink {
public static void main(String[] args) throws Exception{
//生成html parser
DOMParser parser = new DOMParser();
//设置网页的默认编码
parser.setProperty(
"http://cyberneko.org/html/properties/default-encoding",
"gb18030");
//input file
int m=0;
BufferedReader in = new BufferedReader(new FileReader("e:/1.html"));
parser.parse(new InputSource(in));
Document doc = parser.getDocument();
doc.normalize();
//获得body节点,以此为根,计算其文本内容
NodeList body=doc.getElementsByTagName("body");
Element em=(Element) body.item(0);
NodeList elmt=em.getChildNodes();
for(int k=0;k<elmt.getLength();k++)
{int sum=0;
System.out.println(elmt.item(k).getNodeName());
String label=elmt.item(k).getNodeName();
if(label.equalsIgnoreCase("table"))
{int m1=0;
System.out.println("begin....................");
m1=countLink(elmt.item(k),doc);
System.out.println("sum is******************* "+m1);
//System.out.println("^^^^^^^^^^^@@@@@@@@@@@@@@@@@@^^^^^^^^^^^^^^^^^"+m);
}
}
/* TransformerFactory tFactory=TransformerFactory.newInstance();
Transformer transformer=tFactory.newTransformer();
DOMSource source=new DOMSource(doc);
StreamResult result=new StreamResult(new File("e:/1.html"));
transformer.transform(source,result);*/
}
public static int countLink(Node body,Document doc)//递归有问题
{
String str1="";
int sum=0;
NodeList nl=body.getChildNodes();
System.out.println("table beginning .........................................................");
for(int l=0;l<nl.getLength();l++)
{
if(nl.item(l).getNodeName().equalsIgnoreCase("table")&&hasnotable(nl.item(l)))
{
Element elmt=(Element) nl.item(l);
sum+=countTableLink(elmt,str1,doc);
System.out.println("``````````````````````````````````"+sum);
}
else
countLink(nl.item(l),doc);
}
System.out.println("**************@@@@@@@@@@@@@@@@@@@**********"+sum);
return sum;
}//函数结束
private static int countTableLink(Element elmt,String str1,Document doc) {//用递归实现某个table表格下的汉字数
//table 标签下没有孩子table 那么就开始统计链接个数
Attr words=doc.createAttribute("Words");
int he=0;
int k=0;
Element element;
NodeList nodeList=elmt.getChildNodes();
for(int i=0;i<nodeList.getLength();i++ )
{
Node elt= nodeList.item(i);
if(elt.getNodeType()==Node.ELEMENT_NODE)
{
Element elt1=(Element) elt;
he+=countTableLink(elt1,str1,doc);
}
else if(elt.getNodeType()==Node.TEXT_NODE)
{
if(elt.getNodeValue().trim().length()>0)
{
str1+=elt.getNodeValue().trim();
element=(Element) elt.getParentNode();
if(element.getTagName().equalsIgnoreCase("td"))
{
System.out.print("the str1 value "+str1+" "+element.getNodeName()+element.getTextContent());
}
else{
String str2=element.getTextContent();
while(!element.getTagName().equalsIgnoreCase("td"))
{
element=(Element) element.getParentNode();
}
System.out.println("@@@@@@@@@@@@@@@@@"+element.getNodeName()+str2);
}
int k1=str1.length();
he+=k1;
element.setAttribute(words.getNodeName(), String.valueOf(he));
// element.setAttribute("width=",words);
System.out.println(" k1's value is:"+he);
}
}
}
//System.out.println("return he is~~~~~~~~~~~~~~~~~~~~~~~~~~~~"+he);
return he;
}
private static boolean hasnotable(Node body)
{
boolean f=true;
NodeList nl=body.getChildNodes();
for(int i=0;i<nl.getLength();i++)
{
if( nl.item(i).getNodeName().equalsIgnoreCase("table"))
f=false;
}
return f;
}
}
HTML中源代码如下:
<html>
<body>
<table>
<tr>
<td>
<table>
<tr>
<td>
welcome
</td>
</tr>
</table>
</td>
<td>
<table>
<tr>
<td>
welcome
</td>
</tr>
</table>
</td>
</tr></table>
</body>
</html>
运行时并未统计出结果
package edu.swjut.paper;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
public class CountLink {
public static void main(String[] args) throws Exception{
//生成html parser
DOMParser parser = new DOMParser();
//设置网页的默认编码
parser.setProperty(
"http://cyberneko.org/html/properties/default-encoding",
"gb18030");
//input file
int m=0;
BufferedReader in = new BufferedReader(new FileReader("e:/1.html"));
parser.parse(new InputSource(in));
Document doc = parser.getDocument();
doc.normalize();
//获得body节点,以此为根,计算其文本内容
NodeList body=doc.getElementsByTagName("body");
Element em=(Element) body.item(0);
NodeList elmt=em.getChildNodes();
for(int k=0;k<elmt.getLength();k++)
{int sum=0;
System.out.println(elmt.item(k).getNodeName());
String label=elmt.item(k).getNodeName();
if(label.equalsIgnoreCase("table"))
{int m1=0;
System.out.println("begin....................");
m1=countLink(elmt.item(k),doc);
System.out.println("sum is******************* "+m1);
//System.out.println("^^^^^^^^^^^@@@@@@@@@@@@@@@@@@^^^^^^^^^^^^^^^^^"+m);
}
}
/* TransformerFactory tFactory=TransformerFactory.newInstance();
Transformer transformer=tFactory.newTransformer();
DOMSource source=new DOMSource(doc);
StreamResult result=new StreamResult(new File("e:/1.html"));
transformer.transform(source,result);*/
}
public static int countLink(Node body,Document doc)//递归有问题
{
String str1="";
int sum=0;
NodeList nl=body.getChildNodes();
System.out.println("table beginning .........................................................");
for(int l=0;l<nl.getLength();l++)
{
if(nl.item(l).getNodeName().equalsIgnoreCase("table")&&hasnotable(nl.item(l)))
{
Element elmt=(Element) nl.item(l);
sum+=countTableLink(elmt,str1,doc);
System.out.println("``````````````````````````````````"+sum);
}
else
countLink(nl.item(l),doc);
}
System.out.println("**************@@@@@@@@@@@@@@@@@@@**********"+sum);
return sum;
}//函数结束
private static int countTableLink(Element elmt,String str1,Document doc) {//用递归实现某个table表格下的汉字数
//table 标签下没有孩子table 那么就开始统计链接个数
Attr words=doc.createAttribute("Words");
int he=0;
int k=0;
Element element;
NodeList nodeList=elmt.getChildNodes();
for(int i=0;i<nodeList.getLength();i++ )
{
Node elt= nodeList.item(i);
if(elt.getNodeType()==Node.ELEMENT_NODE)
{
Element elt1=(Element) elt;
he+=countTableLink(elt1,str1,doc);
}
else if(elt.getNodeType()==Node.TEXT_NODE)
{
if(elt.getNodeValue().trim().length()>0)
{
str1+=elt.getNodeValue().trim();
element=(Element) elt.getParentNode();
if(element.getTagName().equalsIgnoreCase("td"))
{
System.out.print("the str1 value "+str1+" "+element.getNodeName()+element.getTextContent());
}
else{
String str2=element.getTextContent();
while(!element.getTagName().equalsIgnoreCase("td"))
{
element=(Element) element.getParentNode();
}
System.out.println("@@@@@@@@@@@@@@@@@"+element.getNodeName()+str2);
}
int k1=str1.length();
he+=k1;
element.setAttribute(words.getNodeName(), String.valueOf(he));
// element.setAttribute("width=",words);
System.out.println(" k1's value is:"+he);
}
}
}
//System.out.println("return he is~~~~~~~~~~~~~~~~~~~~~~~~~~~~"+he);
return he;
}
private static boolean hasnotable(Node body)
{
boolean f=true;
NodeList nl=body.getChildNodes();
for(int i=0;i<nl.getLength();i++)
{
if( nl.item(i).getNodeName().equalsIgnoreCase("table"))
f=false;
}
return f;
}
}
HTML中源代码如下:
<html>
<body>
<table>
<tr>
<td>
<table>
<tr>
<td>
welcome
</td>
</tr>
</table>
</td>
<td>
<table>
<tr>
<td>
welcome
</td>
</tr>
</table>
</td>
</tr></table>
</body>
</html>
运行时并未统计出结果
解决方案 »
免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货