网址:http://health.sohu.com/yangshengtang/ 2010年12月20日,我对它是无语了
我设置了编码:gb2312
设置了代理
设置GET请求
设置了accept
设置了setUseCaches false
设置了setAllowUserInteraction true
设了一堆请求属性,可是依然乱码……
真让人无语了啊,有哪位高手不信这个邪?用浏览器浏览却不乱码,我在想自己的浏览器模拟器哪里出了问题。
高手们...
我设置了编码:gb2312
设置了代理
设置GET请求
设置了accept
设置了setUseCaches false
设置了setAllowUserInteraction true
设了一堆请求属性,可是依然乱码……
真让人无语了啊,有哪位高手不信这个邪?用浏览器浏览却不乱码,我在想自己的浏览器模拟器哪里出了问题。
高手们...
是gzip格式的
就是说它把源文件压缩了传输了过来,
你不解压缩无论咋整都是乱码
我随便写了个程序测试了下
还有一部分是乱码,不清楚哪的问题
// Fetch the page, undo the HTTP Content-Encoding on the raw BYTES, and only then
// decode the resulting bytes to text once, as GBK.
String url = "http://health.sohu.com/yangshengtang/";
URL cumtURL = new URL(url);
URLConnection cumtConnection = cumtURL.openConnection();
cumtConnection.setRequestProperty("User-Agent",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13");
cumtConnection.setRequestProperty("Pragma", "no-cache");
cumtConnection.setRequestProperty("Proxy-Connection", "Keep-Alive");
cumtConnection.setRequestProperty("Host", "health.sohu.com");
//Cookie Hm_lvt_9f14aaa038bbba8b12ec2a4a3e51d254=1287650788640; BAIDUID=F25ABEDD87D60C8104D0CF4D75A71979:FG=1; bdime=0; BD_UTK_DVT=1; USERID=fdb673a39972591f66d93243
cumtConnection.setRequestProperty("Cookie", "YYID=C52798160DB2F2EB9A5C522772777A83; SUV=0812111111371033; vjuids=3f340bb23.11ed3e18865.0.7e372d1208008; vjlast=1231915420,1231915420,30.1292830395.10; IPLOC=CN6101");
cumtConnection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
cumtConnection.setRequestProperty("Accept-Language", "zh-cn,zh;q=0.5");
cumtConnection.setRequestProperty("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
cumtConnection.setRequestProperty("Accept-Encoding", "gzip,deflate");

// Ask the server which encoding it actually applied instead of assuming gzip.
String contentEncoding = cumtConnection.getContentEncoding();
InputStream urlStream = new BufferedInputStream(cumtConnection.getInputStream());
java.io.ByteArrayOutputStream htmlBytes = new java.io.ByteArrayOutputStream();
try {
    // A compressed body is binary: it must never pass through a Reader/readLine(),
    // because dropping the CR/LF bytes corrupts the compressed stream.
    InputStream bodyStream = urlStream;
    if ("gzip".equalsIgnoreCase(contentEncoding)) {
        bodyStream = new GZIPInputStream(urlStream);
    } else if ("deflate".equalsIgnoreCase(contentEncoding)) {
        bodyStream = new java.util.zip.InflaterInputStream(urlStream);
    }
    byte data[] = new byte[2048];
    int count;
    while ((count = bodyStream.read(data)) != -1) {
        // Keep only the bytes that were really read in this pass.
        htmlBytes.write(data, 0, count);
    }
} finally {
    urlStream.close();
}
// Decode once over the complete body so a two-byte GBK character is never
// split across two read() chunks.
System.out.println(new String(htmlBytes.toByteArray(), "gbk"));
// Alternative suggested in the thread: let the jsoup library issue the GET request
// for the page and keep the Document it returns.
Document document = Jsoup.connect("http://health.sohu.com/yangshengtang/").get();
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.zip.DeflaterInputStream;
import java.util.zip.InflaterInputStream;
import java.util.zip.GZIPInputStream;

/**
 * Downloads a URL and returns its body as text, undoing the HTTP
 * {@code Content-Encoding} (gzip or deflate) reported by the server before the
 * bytes are decoded with the requested charset.
 */
public class Test2 {

    public static void main(String[] args) throws IOException {
        URL url = new URL("http://health.sohu.com/yangshengtang/");
        String content = doGet(url, "gb2312");
        System.out.println(content);
    }

    /**
     * Fetches {@code url} and decodes the (decompressed) response body.
     *
     * @param url     the resource to fetch
     * @param charset name of the charset used to turn the body bytes into text
     * @return the response body as a string
     * @throws IOException if the connection fails, the body cannot be
     *                     decompressed, or {@code charset} is not supported
     */
    public static String doGet(URL url, String charset) throws IOException {
        URLConnection con = url.openConnection();
        ContentEncoding ce = ContentEncoding.getInstance(con.getContentEncoding());
        byte[] bys = read(ce, con.getInputStream());
        return new String(bys, charset);
    }

    /** Reads {@code in} fully through {@code ce} and always closes {@code in}. */
    private static byte[] read(ContentEncoding ce, InputStream in) throws IOException {
        try {
            return ce.getBytes(in);
        } finally {
            in.close();
        }
    }

    /**
     * The content encodings this class can undo. Each constant knows how to
     * wrap a raw response stream so that reading from it yields decoded bytes.
     */
    public enum ContentEncoding {

        /** No transformation: bytes are passed through unchanged. */
        DEFAULT,

        /** Body compressed with gzip. */
        GZIP {
            @Override
            protected InputStream wrap(InputStream in) throws IOException {
                return new GZIPInputStream(in);
            }
        },

        /** Body compressed with deflate (zlib format). */
        DEFLATE {
            @Override
            protected InputStream wrap(InputStream in) throws IOException {
                // InflaterInputStream DEcompresses; DeflaterInputStream would
                // compress the already-compressed body a second time.
                return new InflaterInputStream(in);
            }
        };

        /**
         * Maps a {@code Content-Encoding} header value to a constant.
         *
         * @param encoding the header value; may be {@code null}
         * @return the constant whose name matches {@code encoding} ignoring case
         *         and surrounding whitespace, or {@link #DEFAULT} when the value
         *         is {@code null}, blank, or not recognised
         */
        public static ContentEncoding getInstance(String encoding) {
            if (encoding == null) {
                return DEFAULT;
            }
            // Compare the trimmed token so that " gzip " still selects GZIP.
            String token = encoding.trim();
            if (token.length() == 0) {
                return DEFAULT;
            }
            for (ContentEncoding e : values()) {
                if (e.name().equalsIgnoreCase(token)) {
                    return e;
                }
            }
            return DEFAULT;
        }

        /**
         * Wraps the raw stream in whatever decoder this encoding needs.
         * The base implementation returns {@code in} unchanged.
         */
        protected InputStream wrap(InputStream in) throws IOException {
            return in;
        }

        /**
         * Reads {@code in} to the end through this encoding's decoder.
         * The stream is not closed here; that is left to the caller.
         *
         * @param in the raw (possibly compressed) stream
         * @return all decoded bytes
         * @throws IOException if reading or decompressing fails
         */
        public byte[] getBytes(InputStream in) throws IOException {
            InputStream decoded = wrap(in);
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            byte[] buffer = new byte[8192];
            int read;
            while ((read = decoded.read(buffer)) != -1) {
                baos.write(buffer, 0, read);
            }
            return baos.toByteArray();
        }
    }
}