我用apache httpclient 4.1.1抓取网页,用String的indexOf方法搜索其中是否含有感兴趣的关键字。搜索GBK、GB2312编码的网页时正常,遇到UTF-8编码的网页就无法搜索,抓取下来的中文内容打印出来也是乱码,无法辨认。肯定是编码问题,但不知该怎么解决。搜索了好长时间,试了各种转换编码的方法,都不能把抓取下来的中文内容正常打印出来,indexOf的搜索结果也都是-1。
解决方案 »
- JAVA无法卸载!Windows Installer程序包有问题,此安装需要的DLL不能运行
- 关于IOException
- 初学者求助谢谢
- 有Jgraph例子或资源的给小弟发点
- 客户端请求信息
- 140分求字符串转换成double完美方法
- http://www.2qq.cn/vip.htm?qq=282232 腾讯过大年,六位数的QQ不要要钱,点击马上申请!看看有分。呵呵,
- 使用resultset 在stringbuffer里面替换文件里所有的字
- CORBA的初学者问题?
- import java.util.*;为什么无法把该包下的类都导入呢
- java中的set和get方法和构造函数什么时候一块用?
- 小弟有关socket服务器送过来数组客户端该如何接收,请教下各位大侠们!!谢谢
人家网页的编码是UTF-8,相当于发送方用的是UTF-8;你的流在接收的时候,作为接收方也必须用UTF-8来解码,这样才行。
UTF-8编码,得UTF-8解码了;所以得先探测出网页编码,然后用相应的编码解码。
不知道apache httpclient 4.1.1能否获取下面信息:
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
或者
使用cpDetector先检测出字符集编码,再用此字符编码来解码。
现在就想知道如何处理这UTF-8字符串
例如将ISO8859-1格式转换为UTF-8
String old = "XXX"; // 假设这是把UTF-8字节误按ISO8859-1解码后得到的字符串
String New = new String(old.getBytes("ISO8859-1"), "UTF-8"); // 先按ISO8859-1还原出原始字节,再按UTF-8重新解码,得到正确的字符串
这样让apache httpclient 4.1.1用指定的charset来解码网页;再来进行你想要的操作。
import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;public class EncodedPostTest {
/**
 * Posts to the test servlet, decodes the response body with the charset
 * announced in the {@code Content-Type} header, and prints every line that
 * contains the keyword "编码".
 *
 * <p>The charset is taken only from the {@code charset} parameter of
 * {@code Content-Type}. The {@code Content-Encoding} header is deliberately
 * ignored: it names a content coding such as {@code gzip}, not a character set.
 * When no charset is announced, "gbk" is used as the fallback.
 *
 * @param args unused
 * @throws Exception if the request fails or the announced charset is not supported
 */
public static void main(String[] args) throws Exception {
    HttpClient httpclient = new DefaultHttpClient();
    BufferedReader bufReader = null;
    try {
        HttpPost httppost = new HttpPost(
                "http://localhost:8080/TestJEEProject/EncodingServlet");
        HttpResponse response = httpclient.execute(httppost);
        if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                String contentType = entity.getContentType() == null
                        ? null
                        : entity.getContentType().getValue();
                String charset = extractCharset(contentType, "gbk");
                System.out.println("Charset : " + charset);
                bufReader = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
                for (String line = bufReader.readLine(); line != null; line = bufReader.readLine()) {
                    if (line.contains("编码")) {
                        System.out.println(line);
                    }
                }
            }
        } else {
            System.out.println("Unexpected failure: "
                    + response.getStatusLine().toString());
        }
    } finally {
        // Close the reader first so the content stream is released before the
        // connection manager goes away; the nested finally guarantees the
        // shutdown still happens if close() throws.
        try {
            if (bufReader != null) {
                bufReader.close();
            }
        } finally {
            httpclient.getConnectionManager().shutdown();
        }
    }
}

/**
 * Extracts the value of the {@code charset} parameter from a Content-Type
 * header value such as {@code text/html; charset="UTF-8"; foo=bar}.
 *
 * @param contentType    raw header value, may be {@code null}
 * @param defaultCharset value to return when no usable charset parameter is present
 * @return the charset name without surrounding quotes or whitespace, or
 *         {@code defaultCharset} if the header is missing or carries no charset
 */
private static String extractCharset(String contentType, String defaultCharset) {
    if (contentType == null) {
        return defaultCharset;
    }
    final String key = "charset=";
    String[] parts = contentType.split(";");
    for (int i = 0; i < parts.length; i++) {
        // Drop all whitespace so "charset = utf-8" is accepted as well.
        String param = parts[i].replaceAll("\\s+", "");
        // Case-insensitive prefix match without lower-casing the value, which
        // keeps the charset name intact regardless of the default locale.
        if (param.regionMatches(true, 0, key, 0, key.length())) {
            String value = param.substring(key.length());
            if (value.length() >= 2 && value.startsWith("\"") && value.endsWith("\"")) {
                value = value.substring(1, value.length() - 1);
            }
            if (value.length() > 0) {
                return value;
            }
        }
    }
    return defaultCharset;
}
}
Servlet:import java.io.IOException;
import java.io.PrintWriter;import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;/**
* Servlet implementation class EncodingServlet
*/
public class EncodingServlet extends HttpServlet {
private static final long serialVersionUID = 1L;
/**
* @see HttpServlet#HttpServlet()
*/
public EncodingServlet() {
super();
} /**
* @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
*/
public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
doPost(request, response);
} /**
* @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
*/
public void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
//set Charset = GBK
// response.setContentType("text/html;charset=GBK");
// response.setHeader("Content-Encoding", "GBK");
//set Charset = UTF-8
response.setContentType("text/html;charset=UTF-8");
// response.setHeader("Content-Encoding", "UTF-8");
PrintWriter out = response.getWriter();
out.print("如何查找UTF-8字符串中的字符串?(字符编码问题)\n");
out.print("我用apache httpclient 4.1.1抓取网页,抓取下来的中文内容打印出来也是无法辨认。\n");
out.print("用String的indexof方法搜索其中是否含有感兴趣的关键字,\n");
out.print("搜索GBK、GB2312编码网页时正常,遇到UFT-8编码网页就无法搜索,\n");
out.print("肯定是编码问题了,不知该怎么解决。搜索了好长时间,试了各种转换编码方法,\n");
out.print("但都不能把抓取下来的中文内容正常打印出来,搜索也都是-1.");
}
}