我想从一个网页里提取里面的中文,代码是下面的
import java.net.*;
import java.util.*;
import java.io.*;
import java.util.regex.*;public class Main {
public String getHtml(String urlString) {
try {
StringBuffer html = new StringBuffer();
URL url = new URL(urlString);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
InputStreamReader isr = new InputStreamReader(conn.getInputStream());
BufferedReader br = new BufferedReader(isr); String temp;
while ((temp = br.readLine()) != null) {
html.append(temp).append("\n");
}
br.close();
isr.close();
return html.toString();
} catch (Exception e) {
e.printStackTrace();
return null;
}
} static String destination="http://www.baidu.com/s?cl=3&tn=sitesowang&f=5&wd=asdf";
public static void main(String[] args) throws MalformedURLException, IOException{
// TODO code application logic here
Main k=new Main();
String total=k.getHtml(destination);
String val=null;
String patternstrs="^[\u4e00-\u9fa5]+";
Pattern p=Pattern.compile(patternstrs);
Matcher m=p.matcher(total);
while(m.find()){
val=m.group();
System.out.println(val);
}
结果打印出来的是诡异的汉字,我在网上看好像是字符流字节流的问题,但最后也没搞懂,希望大家帮忙看下了,谢谢
import java.net.*;
import java.util.*;
import java.io.*;
import java.util.regex.*;public class Main {
public String getHtml(String urlString) {
try {
StringBuffer html = new StringBuffer();
URL url = new URL(urlString);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
InputStreamReader isr = new InputStreamReader(conn.getInputStream());
BufferedReader br = new BufferedReader(isr); String temp;
while ((temp = br.readLine()) != null) {
html.append(temp).append("\n");
}
br.close();
isr.close();
return html.toString();
} catch (Exception e) {
e.printStackTrace();
return null;
}
} static String destination="http://www.baidu.com/s?cl=3&tn=sitesowang&f=5&wd=asdf";
public static void main(String[] args) throws MalformedURLException, IOException{
// TODO code application logic here
Main k=new Main();
String total=k.getHtml(destination);
String val=null;
String patternstrs="^[\u4e00-\u9fa5]+";
Pattern p=Pattern.compile(patternstrs);
Matcher m=p.matcher(total);
while(m.find()){
val=m.group();
System.out.println(val);
}
结果打印出来的是诡异的汉字,我在网上看好像是字符流字节流的问题,但最后也没搞懂,希望大家帮忙看下了,谢谢
解决方案 »
免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货