一下是我用的cyberneko解析的html文件,对于编码是通过sniffCharacterEncoding方法从html页面读取的。 package com.wintim.corpus.reader;import java.nio.charset.Charset; import java.io.*; import java.util.regex.*; import org.cyberneko.html.parsers.*; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.w3c.dom.*; import org.apache.html.dom.*; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory;import com.wintim.corpus.reader.html.NodeWalker;public class HtmlDocReader implements DocReader { public static final Log LOG = LogFactory .getLog("org.apache.nutch.parse.html"); // I used 1000 bytes at first, but found that some documents have // meta tag well past the first 1000 bytes. // (e.g. http://cn.promo.yahoo.com/customcare/music.html) private static final int CHUNK_SIZE = 2000; private static Pattern metaPattern = Pattern.compile( "<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>", Pattern.CASE_INSENSITIVE); private static Pattern charsetPattern = Pattern.compile( "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE); private InputStream is; public HtmlDocReader(InputStream is) { this.is = is; } public HtmlDocReader(File file) { try { this.is = new FileInputStream(file); } catch (FileNotFoundException e) { e.printStackTrace(); } } public HtmlDocReader(String file) { try { this.is = new FileInputStream(file); } catch (FileNotFoundException e) { e.printStackTrace(); } } private static String sniffCharacterEncoding(InputStream in) { in.(CHUNK_SIZE); byte[] b = new byte[CHUNK_SIZE]; int length = 0; try { length = in.read(b, 0, CHUNK_SIZE); in.reset(); } catch (IOException e1) { e1.printStackTrace(); } // We don't care about non-ASCII parts so that it's sufficient // to just inflate each byte to a 16-bit value by padding. // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into // {U+0041, U+0082, U+00B7}. 
String str = ""; try { str = new String(b, 0, length, Charset.forName("ASCII").toString()); } catch (UnsupportedEncodingException e) { // code should never come here, but just in case... return null; } Matcher metaMatcher = metaPattern.matcher(str); String encoding = null; if (metaMatcher.find()) { Matcher charsetMatcher = charsetPattern.matcher(metaMatcher .group(1)); if (charsetMatcher.find()) encoding = new String(charsetMatcher.group(1)); } return encoding; } public String read() { String text = ""; // parse the content DocumentFragment root = null; BufferedInputStream bufIs = new BufferedInputStream(is); String encoding = sniffCharacterEncoding(bufIs); InputSource input = new InputSource(bufIs); input.setEncoding(encoding); try { root = parse(input); } catch (Exception e) { e.printStackTrace(); } text = getText(root); //extract text return text; }
private String getText(Node node) { NodeWalker walker = new NodeWalker(node); StringBuffer sb = new StringBuffer();; while (walker.hasNext()) { Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); if ("script".equalsIgnoreCase(nodeName)) { walker.skipChildren(); } if ("style".equalsIgnoreCase(nodeName)) { walker.skipChildren(); } if (nodeType == Node.COMMENT_NODE) { walker.skipChildren(); } if (nodeType == Node.TEXT_NODE) { // cleanup and trim the value String text = currentNode.getNodeValue(); text = text.replaceAll("\\s+", " "); text = text.trim(); if (text.length() > 0) { if (sb.length() > 0) sb.append(' '); sb.append(text); } } } return sb.toString(); } private DocumentFragment parse(InputSource input) throws Exception { DOMFragmentParser parser = new DOMFragmentParser(); try { parser.setFeature("http://cyberneko.org/html/features/augmentations", true); parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true); parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false); parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true); parser.setFeature("http://cyberneko.org/html/features/report-errors", LOG.isTraceEnabled()); } catch (SAXException e) { } // convert Document to DocumentFragment HTMLDocumentImpl doc = new HTMLDocumentImpl(); doc.setErrorChecking(false); DocumentFragment res = doc.createDocumentFragment(); DocumentFragment frag = doc.createDocumentFragment(); parser.parse(input, frag); res.appendChild(frag); try { while (true) { frag = doc.createDocumentFragment(); parser.parse(input, frag); if (!frag.hasChildNodes()) break; if (LOG.isInfoEnabled()) { LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes."); } res.appendChild(frag); } } catch (Exception x) { x.printStackTrace(); } return res; } public static void main(String[] args) throws Exception { 
HtmlDocReader excelReader = new HtmlDocReader(new File("test/html/htmltest.html")); String content = "无标题文档 this is a 中文文档!"; System.out.println(excelReader.read().trim()); }}package com.wintim.corpus.reader.html;import java.util.Stack;import org.w3c.dom.Node; import org.w3c.dom.NodeList;/** * <p>A utility class that allows the walking of any DOM tree using a stack * instead of recursion. As the node tree is walked the next node is popped * off of the stack and all of its children are automatically added to the * stack to be called in tree order.</p> * * <p>Currently this class is not thread safe. It is assumed that only one * thread will be accessing the <code>NodeWalker</code> at any given time.</p> */ public class NodeWalker { // the root node the the stack holding the nodes private Node currentNode; private NodeList currentChildren; private Stack<Node> nodes;
/** * Starts the <code>Node</code> tree from the root node. * * @param rootNode */ public NodeWalker(Node rootNode) { nodes = new Stack<Node>(); nodes.add(rootNode); }
/** * <p>Returns the next <code>Node</code> on the stack and pushes all of its * children onto the stack, allowing us to walk the node tree without the * use of recursion. If there are no more nodes on the stack then null is * returned.</p> * * @return Node The next <code>Node</code> on the stack or null if there * isn't a next node. */ public Node nextNode() {
// if no next node return null if (!hasNext()) { return null; }
// pop the next node off of the stack and push all of its children onto // the stack currentNode = nodes.pop(); currentChildren = currentNode.getChildNodes(); int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
// put the children node on the stack in first to last order for (int i = childLen - 1; i >= 0; i--) { nodes.add(currentChildren.item(i)); }
return currentNode; }
/** * <p>Skips over and removes from the node stack the children of the last * node. When getting a next node from the walker, that node's children * are automatically added to the stack. You can call this method to remove * those children from the stack.</p> * * <p>This is useful when you don't want to process deeper into the * current path of the node tree but you want to continue processing sibling * nodes.</p> * */ public void skipChildren() {
int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
for (int i = 0 ; i < childLen ; i++) { Node child = nodes.peek(); if (child.equals(currentChildren.item(i))) { nodes.pop(); } } }
/** * Returns true if there are more nodes on the current stack. * @return */ public boolean hasNext() { return (nodes.size() > 0); } }
package com.wintim.corpus.reader;import java.nio.charset.Charset;
import java.io.*;
import java.util.regex.*;
import org.cyberneko.html.parsers.*;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.w3c.dom.*;
import org.apache.html.dom.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;import com.wintim.corpus.reader.html.NodeWalker;public class HtmlDocReader implements DocReader {
// Logger name retained from the Nutch html parser this code was adapted from.
public static final Log LOG = LogFactory
.getLog("org.apache.nutch.parse.html");
// Number of leading bytes scanned for a <meta http-equiv="content-type"> tag.
// 1000 bytes was too small: some documents declare the charset well past the
// first 1000 bytes.
// (e.g. http://cn.promo.yahoo.com/customcare/music.html)
private static final int CHUNK_SIZE = 2000;
// Matches a <meta ... http-equiv="content-type" ...> element; group(1) holds
// the attribute text the charset is extracted from.
private static Pattern metaPattern = Pattern.compile(
"<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>",
Pattern.CASE_INSENSITIVE);
// Extracts the charset name out of the matched meta element's attributes.
private static Pattern charsetPattern = Pattern.compile(
"charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
// Raw HTML input; wrapped in a BufferedInputStream by read().
private InputStream is;
public HtmlDocReader(InputStream is) {
this.is = is;
}
public HtmlDocReader(File file) {
try {
this.is = new FileInputStream(file);
} catch (FileNotFoundException e) {
// NOTE(review): swallowing the exception leaves 'is' null and read()
// will fail later; kept for interface compatibility.
e.printStackTrace();
}
}
public HtmlDocReader(String file) {
try {
this.is = new FileInputStream(file);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
/**
 * Sniffs the character encoding declared in the document itself by scanning
 * the first CHUNK_SIZE bytes for a meta content-type tag.  The stream must
 * support mark/reset; its position is restored before returning.
 *
 * @param in a mark-supporting stream positioned at the start of the document
 * @return the declared charset name, or null if none was found
 */
private static String sniffCharacterEncoding(InputStream in) {
// Fix: the original line read "in.(CHUNK_SIZE);" - the mark() call had
// lost its method name.  mark() is required for the reset() below.
in.mark(CHUNK_SIZE);
byte[] b = new byte[CHUNK_SIZE];
int length = 0;
try {
length = in.read(b, 0, CHUNK_SIZE);
in.reset();
} catch (IOException e1) {
e1.printStackTrace();
}
// Guard: read() returns -1 at end-of-stream, which would make the String
// constructor below throw IndexOutOfBoundsException.
if (length <= 0) {
return null;
}
// We don't care about non-ASCII parts (the tag and attribute names are
// ASCII), so decoding the prefix as US-ASCII is sufficient for the scan.
String str = "";
try {
// "US-ASCII" is the canonical name Charset.forName("ASCII") resolves to;
// the Charset round-trip in the original was redundant.
str = new String(b, 0, length, "US-ASCII");
} catch (UnsupportedEncodingException e) {
// code should never come here (US-ASCII is mandatory), but just in case...
return null;
}
Matcher metaMatcher = metaPattern.matcher(str);
String encoding = null;
if (metaMatcher.find()) {
Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
if (charsetMatcher.find()) {
// group(1) is already an independent String; no defensive copy needed.
encoding = charsetMatcher.group(1);
}
}
return encoding;
}
/**
 * Parses the HTML document and returns its whitespace-normalized text
 * content, honoring the charset declared in the document when present.
 *
 * @return the extracted text, or "" if parsing failed
 */
public String read() {
DocumentFragment root = null;
// Buffer the stream so sniffCharacterEncoding can mark/reset it and the
// parser then re-reads the document from the beginning.
BufferedInputStream bufIs = new BufferedInputStream(is);
String encoding = sniffCharacterEncoding(bufIs);
InputSource input = new InputSource(bufIs);
input.setEncoding(encoding);
try {
root = parse(input);
} catch (Exception e) {
e.printStackTrace();
}
// Guard: on parse failure root is null and getText/NodeWalker would NPE.
if (root == null) {
return "";
}
return getText(root); // extract text
}
/**
 * Walks the DOM fragment and concatenates the values of all text nodes,
 * skipping the contents of script and style elements and of comments.
 * Runs of whitespace inside each text node are collapsed to single spaces
 * and the pieces are joined with single spaces.
 *
 * @param node root of the (sub)tree to extract text from
 * @return the extracted text
 */
private String getText(Node node) {
NodeWalker walker = new NodeWalker(node);
// StringBuilder: a local accumulator needs no synchronization (the
// original used StringBuffer and had a stray double semicolon).
StringBuilder sb = new StringBuilder();
while (walker.hasNext()) {
Node currentNode = walker.nextNode();
String nodeName = currentNode.getNodeName();
short nodeType = currentNode.getNodeType();
// The branches are mutually exclusive (an element named "script"/"style",
// a comment node, and a text node are distinct node kinds), so a single
// else-if chain preserves the original behavior.
if ("script".equalsIgnoreCase(nodeName)
|| "style".equalsIgnoreCase(nodeName)
|| nodeType == Node.COMMENT_NODE) {
// Do not descend into non-content nodes.
walker.skipChildren();
} else if (nodeType == Node.TEXT_NODE) {
// cleanup and trim the value
String text = currentNode.getNodeValue();
text = text.replaceAll("\\s+", " ").trim();
if (text.length() > 0) {
if (sb.length() > 0) {
sb.append(' ');
}
sb.append(text);
}
}
}
return sb.toString();
}
/**
 * Parses the input into a DocumentFragment with the CyberNeko HTML
 * fragment parser.  Parsing is repeated in a loop and every resulting
 * fragment is appended to one result fragment, so an input that contains
 * several concatenated HTML documents is fully consumed.
 *
 * @param input the HTML input source (encoding already set by the caller)
 * @return a fragment holding all parsed content
 * @throws Exception if the first parse pass fails
 */
private DocumentFragment parse(InputSource input) throws Exception {
DOMFragmentParser parser = new DOMFragmentParser();
try {
// expose parser augmentations (e.g. element location info)
parser.setFeature("http://cyberneko.org/html/features/augmentations",
true);
// the charset was already sniffed and set on the InputSource, so any
// <meta> charset found mid-stream is ignored
parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset",
true);
parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
false);
// produce a document fragment rather than a full document
parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
true);
parser.setFeature("http://cyberneko.org/html/features/report-errors",
LOG.isTraceEnabled());
} catch (SAXException e) {
// NOTE(review): silently ignored - an unsupported feature only degrades
// parsing quality; consider logging this at debug level.
}
// convert Document to DocumentFragment
HTMLDocumentImpl doc = new HTMLDocumentImpl();
doc.setErrorChecking(false);
DocumentFragment res = doc.createDocumentFragment();
DocumentFragment frag = doc.createDocumentFragment();
parser.parse(input, frag);
res.appendChild(frag);
try {
// keep parsing while the stream yields more documents; stop on the
// first empty fragment
while (true) {
frag = doc.createDocumentFragment();
parser.parse(input, frag);
if (!frag.hasChildNodes())
break;
if (LOG.isInfoEnabled()) {
LOG.info(" - new frag, " + frag.getChildNodes().getLength()
+ " nodes.");
}
res.appendChild(frag);
}
} catch (Exception x) {
// best-effort: a failure on a trailing document keeps what was parsed
// so far
x.printStackTrace();
}
return res;
} public static void main(String[] args) throws Exception {
// Smoke test: extract the text of a sample document and compare it against
// the expected result.  (Local renamed from "excelReader" - a copy/paste
// leftover from another DocReader implementation - and the previously
// unused expected string is now actually checked.)
HtmlDocReader htmlReader = new HtmlDocReader(new File("test/html/htmltest.html"));
String expected = "无标题文档 this is a 中文文档!";
String actual = htmlReader.read().trim();
System.out.println(actual);
if (!expected.equals(actual)) {
System.err.println("WARNING: extraction did not match the expected text");
}
}}package com.wintim.corpus.reader.html;import java.util.Stack;import org.w3c.dom.Node;
import org.w3c.dom.NodeList;/**
 * <p>A utility class that walks any DOM tree iteratively, using an explicit
 * stack instead of recursion.  Each call to <code>nextNode()</code> pops the
 * next node and pushes its children, so nodes come back in document
 * (pre-) order.</p>
 *
 * <p>Currently this class is not thread safe. It is assumed that only one
 * thread will be accessing the <code>NodeWalker</code> at any given time.</p>
 */
public class NodeWalker { // walker state: the last node returned, its child
// list (used by skipChildren), and the stack of nodes still to visit
private Node currentNode;
private NodeList currentChildren;
private Stack<Node> nodes;
/**
 * Creates a walker whose traversal starts at (and first returns) the
 * given root node.
 *
 * @param rootNode the root of the tree to walk
 */
public NodeWalker(Node rootNode) { nodes = new Stack<Node>();
nodes.add(rootNode);
}
/**
 * <p>Pops and returns the next <code>Node</code> from the stack and pushes
 * all of its children, allowing the tree to be walked without recursion.
 * Once the stack is exhausted, null is returned.</p>
 *
 * @return the next node in document order, or null when the traversal
 *         is finished
 */
public Node nextNode() {
// nothing left to visit
if (!hasNext()) {
return null;
}
// take the next node and remember its children for skipChildren()
currentNode = nodes.pop();
currentChildren = currentNode.getChildNodes();
int count = (currentChildren == null) ? 0 : currentChildren.getLength();
// push the children last-to-first so the first child lands on top of the
// stack and is therefore visited next
for (int idx = count - 1; idx >= 0; idx--) {
nodes.push(currentChildren.item(idx));
}
return currentNode;
}
/**
 * <p>Skips over and removes from the node stack the children of the last
 * node. When getting a next node from the walker, that node's children
 * are automatically added to the stack. You can call this method to remove
 * those children from the stack.</p>
 *
 * <p>This is useful when you don't want to process deeper into the
 * current path of the node tree but you want to continue processing sibling
 * nodes.</p>
 *
 */
public void skipChildren() {
int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
// nextNode() pushed the children in reverse, so child 0 sits on top of
// the stack; pop each child while it matches in order.  If the top does
// not match (the child was already consumed), nothing is popped for it.
for (int i = 0 ; i < childLen ; i++) {
Node child = nodes.peek();
if (child.equals(currentChildren.item(i))) {
nodes.pop();
}
}
}
/**
 * Indicates whether the traversal still has nodes to visit.
 *
 * @return true while the internal stack is non-empty
 */
public boolean hasNext() {
return !nodes.isEmpty();
}
}
关注中……如果楼主解决了问题,希望能在这里分享一下。
后来看了下 weblech,受到启发:原来可以不用通过 String 来输出,直接 write(byte[]) 即可。
如果你要转换成 String,那就要注意编码方式了。