一下是我用的cyberneko解析的html文件,对于编码是通过sniffCharacterEncoding方法从html页面读取的。 package com.wintim.corpus.reader;import java.nio.charset.Charset; import java.io.*; import java.util.regex.*; import org.cyberneko.html.parsers.*; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.w3c.dom.*; import org.apache.html.dom.*; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory;import com.wintim.corpus.reader.html.NodeWalker;public class HtmlDocReader implements DocReader { public static final Log LOG = LogFactory .getLog("org.apache.nutch.parse.html"); // I used 1000 bytes at first, but found that some documents have // meta tag well past the first 1000 bytes. // (e.g. http://cn.promo.yahoo.com/customcare/music.html) private static final int CHUNK_SIZE = 2000; private static Pattern metaPattern = Pattern.compile( "<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>", Pattern.CASE_INSENSITIVE); private static Pattern charsetPattern = Pattern.compile( "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE); private InputStream is; public HtmlDocReader(InputStream is) { this.is = is; } public HtmlDocReader(File file) { try { this.is = new FileInputStream(file); } catch (FileNotFoundException e) { e.printStackTrace(); } } public HtmlDocReader(String file) { try { this.is = new FileInputStream(file); } catch (FileNotFoundException e) { e.printStackTrace(); } } private static String sniffCharacterEncoding(InputStream in) { in.(CHUNK_SIZE); byte[] b = new byte[CHUNK_SIZE]; int length = 0; try { length = in.read(b, 0, CHUNK_SIZE); in.reset(); } catch (IOException e1) { e1.printStackTrace(); } // We don't care about non-ASCII parts so that it's sufficient // to just inflate each byte to a 16-bit value by padding. // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into // {U+0041, U+0082, U+00B7}. 
String str = ""; try { str = new String(b, 0, length, Charset.forName("ASCII").toString()); } catch (UnsupportedEncodingException e) { // code should never come here, but just in case... return null; } Matcher metaMatcher = metaPattern.matcher(str); String encoding = null; if (metaMatcher.find()) { Matcher charsetMatcher = charsetPattern.matcher(metaMatcher .group(1)); if (charsetMatcher.find()) encoding = new String(charsetMatcher.group(1)); } return encoding; } public String read() { String text = ""; // parse the content DocumentFragment root = null; BufferedInputStream bufIs = new BufferedInputStream(is); String encoding = sniffCharacterEncoding(bufIs); InputSource input = new InputSource(bufIs); input.setEncoding(encoding); try { root = parse(input); } catch (Exception e) { e.printStackTrace(); } text = getText(root); //extract text return text; }
private String getText(Node node) { NodeWalker walker = new NodeWalker(node); StringBuffer sb = new StringBuffer();; while (walker.hasNext()) { Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); if ("script".equalsIgnoreCase(nodeName)) { walker.skipChildren(); } if ("style".equalsIgnoreCase(nodeName)) { walker.skipChildren(); } if (nodeType == Node.COMMENT_NODE) { walker.skipChildren(); } if (nodeType == Node.TEXT_NODE) { // cleanup and trim the value String text = currentNode.getNodeValue(); text = text.replaceAll("\\s+", " "); text = text.trim(); if (text.length() > 0) { if (sb.length() > 0) sb.append(' '); sb.append(text); } } } return sb.toString(); } private DocumentFragment parse(InputSource input) throws Exception { DOMFragmentParser parser = new DOMFragmentParser(); try { parser.setFeature("http://cyberneko.org/html/features/augmentations", true); parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true); parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false); parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true); parser.setFeature("http://cyberneko.org/html/features/report-errors", LOG.isTraceEnabled()); } catch (SAXException e) { } // convert Document to DocumentFragment HTMLDocumentImpl doc = new HTMLDocumentImpl(); doc.setErrorChecking(false); DocumentFragment res = doc.createDocumentFragment(); DocumentFragment frag = doc.createDocumentFragment(); parser.parse(input, frag); res.appendChild(frag); try { while (true) { frag = doc.createDocumentFragment(); parser.parse(input, frag); if (!frag.hasChildNodes()) break; if (LOG.isInfoEnabled()) { LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes."); } res.appendChild(frag); } } catch (Exception x) { x.printStackTrace(); } return res; } public static void main(String[] args) throws Exception { 
HtmlDocReader excelReader = new HtmlDocReader(new File("test/html/htmltest.html")); String content = "无标题文档 this is a 中文文档!"; System.out.println(excelReader.read().trim()); }}package com.wintim.corpus.reader.html;import java.util.Stack;import org.w3c.dom.Node; import org.w3c.dom.NodeList;/** * <p>A utility class that allows the walking of any DOM tree using a stack * instead of recursion. As the node tree is walked the next node is popped * off of the stack and all of its children are automatically added to the * stack to be called in tree order.</p> * * <p>Currently this class is not thread safe. It is assumed that only one * thread will be accessing the <code>NodeWalker</code> at any given time.</p> */ public class NodeWalker { // the root node the the stack holding the nodes private Node currentNode; private NodeList currentChildren; private Stack<Node> nodes;
/** * Starts the <code>Node</code> tree from the root node. * * @param rootNode */ public NodeWalker(Node rootNode) { nodes = new Stack<Node>(); nodes.add(rootNode); }
/** * <p>Returns the next <code>Node</code> on the stack and pushes all of its * children onto the stack, allowing us to walk the node tree without the * use of recursion. If there are no more nodes on the stack then null is * returned.</p> * * @return Node The next <code>Node</code> on the stack or null if there * isn't a next node. */ public Node nextNode() {
// if no next node return null if (!hasNext()) { return null; }
// pop the next node off of the stack and push all of its children onto // the stack currentNode = nodes.pop(); currentChildren = currentNode.getChildNodes(); int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
// put the children node on the stack in first to last order for (int i = childLen - 1; i >= 0; i--) { nodes.add(currentChildren.item(i)); }
return currentNode; }
/** * <p>Skips over and removes from the node stack the children of the last * node. When getting a next node from the walker, that node's children * are automatically added to the stack. You can call this method to remove * those children from the stack.</p> * * <p>This is useful when you don't want to process deeper into the * current path of the node tree but you want to continue processing sibling * nodes.</p> * */ public void skipChildren() {
int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
for (int i = 0 ; i < childLen ; i++) { Node child = nodes.peek(); if (child.equals(currentChildren.item(i))) { nodes.pop(); } } }
/** * Returns true if there are more nodes on the current stack. * @return */ public boolean hasNext() { return (nodes.size() > 0); } }
package com.wintim.corpus.reader;import java.nio.charset.Charset;
import java.io.*;
import java.util.regex.*;
import org.cyberneko.html.parsers.*;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.w3c.dom.*;
import org.apache.html.dom.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;import com.wintim.corpus.reader.html.NodeWalker;public class HtmlDocReader implements DocReader {
// Logger name retained from the Nutch html parser this code was adapted from.
public static final Log LOG = LogFactory
.getLog("org.apache.nutch.parse.html");
// Number of leading bytes scanned for a <meta http-equiv="content-type"> tag.
// 1000 bytes was too small: some documents declare the charset well past the
// first 1000 bytes.
// (e.g. http://cn.promo.yahoo.com/customcare/music.html)
private static final int CHUNK_SIZE = 2000;
// Matches a <meta ... http-equiv="content-type" ...> element; group(1) holds
// the attribute text the charset is extracted from.
private static Pattern metaPattern = Pattern.compile(
"<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>",
Pattern.CASE_INSENSITIVE);
// Extracts the charset name out of the matched meta element's attributes.
private static Pattern charsetPattern = Pattern.compile(
"charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
// Raw HTML input; wrapped in a BufferedInputStream by read().
private InputStream is;
public HtmlDocReader(InputStream is) {
this.is = is;
}
public HtmlDocReader(File file) {
try {
this.is = new FileInputStream(file);
} catch (FileNotFoundException e) {
// NOTE(review): swallowing the exception leaves 'is' null and read()
// will fail later; kept for interface compatibility.
e.printStackTrace();
}
}
public HtmlDocReader(String file) {
try {
this.is = new FileInputStream(file);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
/**
 * Sniffs the character encoding declared in the document itself by scanning
 * the first CHUNK_SIZE bytes for a meta content-type tag.  The stream must
 * support mark/reset; its position is restored before returning.
 *
 * @param in a mark-supporting stream positioned at the start of the document
 * @return the declared charset name, or null if none was found
 */
private static String sniffCharacterEncoding(InputStream in) {
// Fix: the original line read "in.(CHUNK_SIZE);" - the mark() call had
// lost its method name.  mark() is required for the reset() below.
in.mark(CHUNK_SIZE);
byte[] b = new byte[CHUNK_SIZE];
int length = 0;
try {
length = in.read(b, 0, CHUNK_SIZE);
in.reset();
} catch (IOException e1) {
e1.printStackTrace();
}
// Guard: read() returns -1 at end-of-stream, which would make the String
// constructor below throw IndexOutOfBoundsException.
if (length <= 0) {
return null;
}
// We don't care about non-ASCII parts (the tag and attribute names are
// ASCII), so decoding the prefix as US-ASCII is sufficient for the scan.
String str = "";
try {
// "US-ASCII" is the canonical name Charset.forName("ASCII") resolves to;
// the Charset round-trip in the original was redundant.
str = new String(b, 0, length, "US-ASCII");
} catch (UnsupportedEncodingException e) {
// code should never come here (US-ASCII is mandatory), but just in case...
return null;
}
Matcher metaMatcher = metaPattern.matcher(str);
String encoding = null;
if (metaMatcher.find()) {
Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
if (charsetMatcher.find()) {
// group(1) is already an independent String; no defensive copy needed.
encoding = charsetMatcher.group(1);
}
}
return encoding;
}
/**
 * Parses the HTML document and returns its whitespace-normalized text
 * content, honoring the charset declared in the document when present.
 *
 * @return the extracted text, or "" if parsing failed
 */
public String read() {
DocumentFragment root = null;
// Buffer the stream so sniffCharacterEncoding can mark/reset it and the
// parser then re-reads the document from the beginning.
BufferedInputStream bufIs = new BufferedInputStream(is);
String encoding = sniffCharacterEncoding(bufIs);
InputSource input = new InputSource(bufIs);
input.setEncoding(encoding);
try {
root = parse(input);
} catch (Exception e) {
e.printStackTrace();
}
// Guard: on parse failure root is null and getText/NodeWalker would NPE.
if (root == null) {
return "";
}
return getText(root); // extract text
}
/**
 * Walks the DOM fragment and concatenates the values of all text nodes,
 * skipping the contents of script and style elements and of comments.
 * Runs of whitespace inside each text node are collapsed to single spaces
 * and the pieces are joined with single spaces.
 *
 * @param node root of the (sub)tree to extract text from
 * @return the extracted text
 */
private String getText(Node node) {
NodeWalker walker = new NodeWalker(node);
// StringBuilder: a local accumulator needs no synchronization (the
// original used StringBuffer and had a stray double semicolon).
StringBuilder sb = new StringBuilder();
while (walker.hasNext()) {
Node currentNode = walker.nextNode();
String nodeName = currentNode.getNodeName();
short nodeType = currentNode.getNodeType();
// The branches are mutually exclusive (an element named "script"/"style",
// a comment node, and a text node are distinct node kinds), so a single
// else-if chain preserves the original behavior.
if ("script".equalsIgnoreCase(nodeName)
|| "style".equalsIgnoreCase(nodeName)
|| nodeType == Node.COMMENT_NODE) {
// Do not descend into non-content nodes.
walker.skipChildren();
} else if (nodeType == Node.TEXT_NODE) {
// cleanup and trim the value
String text = currentNode.getNodeValue();
text = text.replaceAll("\\s+", " ").trim();
if (text.length() > 0) {
if (sb.length() > 0) {
sb.append(' ');
}
sb.append(text);
}
}
}
return sb.toString();
}
/**
 * Parses the input into a DocumentFragment with the CyberNeko HTML
 * fragment parser.  Parsing is repeated in a loop and every resulting
 * fragment is appended to one result fragment, so an input that contains
 * several concatenated HTML documents is fully consumed.
 *
 * @param input the HTML input source (encoding already set by the caller)
 * @return a fragment holding all parsed content
 * @throws Exception if the first parse pass fails
 */
private DocumentFragment parse(InputSource input) throws Exception {
DOMFragmentParser parser = new DOMFragmentParser();
try {
// expose parser augmentations (e.g. element location info)
parser.setFeature("http://cyberneko.org/html/features/augmentations",
true);
// the charset was already sniffed and set on the InputSource, so any
// <meta> charset found mid-stream is ignored
parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset",
true);
parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
false);
// produce a document fragment rather than a full document
parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
true);
parser.setFeature("http://cyberneko.org/html/features/report-errors",
LOG.isTraceEnabled());
} catch (SAXException e) {
// NOTE(review): silently ignored - an unsupported feature only degrades
// parsing quality; consider logging this at debug level.
}
// convert Document to DocumentFragment
HTMLDocumentImpl doc = new HTMLDocumentImpl();
doc.setErrorChecking(false);
DocumentFragment res = doc.createDocumentFragment();
DocumentFragment frag = doc.createDocumentFragment();
parser.parse(input, frag);
res.appendChild(frag);
try {
// keep parsing while the stream yields more documents; stop on the
// first empty fragment
while (true) {
frag = doc.createDocumentFragment();
parser.parse(input, frag);
if (!frag.hasChildNodes())
break;
if (LOG.isInfoEnabled()) {
LOG.info(" - new frag, " + frag.getChildNodes().getLength()
+ " nodes.");
}
res.appendChild(frag);
}
} catch (Exception x) {
// best-effort: a failure on a trailing document keeps what was parsed
// so far
x.printStackTrace();
}
return res;
} public static void main(String[] args) throws Exception {
// Smoke test: extract the text of a sample document and compare it against
// the expected result.  (Local renamed from "excelReader" - a copy/paste
// leftover from another DocReader implementation - and the previously
// unused expected string is now actually checked.)
HtmlDocReader htmlReader = new HtmlDocReader(new File("test/html/htmltest.html"));
String expected = "无标题文档 this is a 中文文档!";
String actual = htmlReader.read().trim();
System.out.println(actual);
if (!expected.equals(actual)) {
System.err.println("WARNING: extraction did not match the expected text");
}
}}package com.wintim.corpus.reader.html;import java.util.Stack;import org.w3c.dom.Node;
import org.w3c.dom.NodeList;/**
 * <p>A utility class that walks any DOM tree iteratively, using an explicit
 * stack instead of recursion.  Each call to <code>nextNode()</code> pops the
 * next node and pushes its children, so nodes come back in document
 * (pre-) order.</p>
 *
 * <p>Currently this class is not thread safe. It is assumed that only one
 * thread will be accessing the <code>NodeWalker</code> at any given time.</p>
 */
public class NodeWalker { // walker state: the last node returned, its child
// list (used by skipChildren), and the stack of nodes still to visit
private Node currentNode;
private NodeList currentChildren;
private Stack<Node> nodes;
/**
 * Creates a walker whose traversal starts at (and first returns) the
 * given root node.
 *
 * @param rootNode the root of the tree to walk
 */
public NodeWalker(Node rootNode) { nodes = new Stack<Node>();
nodes.add(rootNode);
}
/**
 * <p>Pops and returns the next <code>Node</code> from the stack and pushes
 * all of its children, allowing the tree to be walked without recursion.
 * Once the stack is exhausted, null is returned.</p>
 *
 * @return the next node in document order, or null when the traversal
 *         is finished
 */
public Node nextNode() {
// nothing left to visit
if (!hasNext()) {
return null;
}
// take the next node and remember its children for skipChildren()
currentNode = nodes.pop();
currentChildren = currentNode.getChildNodes();
int count = (currentChildren == null) ? 0 : currentChildren.getLength();
// push the children last-to-first so the first child lands on top of the
// stack and is therefore visited next
for (int idx = count - 1; idx >= 0; idx--) {
nodes.push(currentChildren.item(idx));
}
return currentNode;
}
/**
 * <p>Skips over and removes from the node stack the children of the last
 * node. When getting a next node from the walker, that node's children
 * are automatically added to the stack. You can call this method to remove
 * those children from the stack.</p>
 *
 * <p>This is useful when you don't want to process deeper into the
 * current path of the node tree but you want to continue processing sibling
 * nodes.</p>
 *
 */
public void skipChildren() {
int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
// nextNode() pushed the children in reverse, so child 0 sits on top of
// the stack; pop each child while it matches in order.  If the top does
// not match (the child was already consumed), nothing is popped for it.
for (int i = 0 ; i < childLen ; i++) {
Node child = nodes.peek();
if (child.equals(currentChildren.item(i))) {
nodes.pop();
}
}
}
/**
 * Indicates whether the traversal still has nodes to visit.
 *
 * @return true while the internal stack is non-empty
 */
public boolean hasNext() {
return !nodes.isEmpty();
}
}
关注中……如果楼主解决了问题,希望能在这里分享一下。
后来看了下 weblech,受到启发:原来可以不用通过 String 来输出,直接 write(byte[]) 即可。
如果你要转换成 String,那就要注意编码方式了。