java读取网页源代码 小弟因工作需要写一个类,用于获得网页源代码:要求传入的url是随机的,且返回的源代码不能有乱码。小弟写了一天都没解决,跪求大侠们帮下忙。。 (随便传入一个URL,都要能得到其源代码。不能打开的则返回一个字符串"页面不存在")小弟没分了,求高手们别嫌少哈。 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 利用HttpClient获取网页内容http://lixinye0123.javaeye.com/blog/318299这是别人的代码~ public class URLSource { //链表list用来存储相关的网页链接 static private List<String> list = new LinkedList<String>(); public static void downLoad(String eventName, int documentsNumber) { String source=null;//存储网页源文件 //采用谷歌资讯搜索,网页内容按相关度排序 //String url = http://news.google.cn/archivesearch?q=躲猫猫&num=50&hl=zh-CN&ned=ccn&scoring=a String url = "http://news.google.cn/archivesearch?q="+eventName+"&num="+documentsNumber+"&hl=zh-CN&ned=ccn&scoring=a"; source = getSource(url); //抽取每个网页的正文内容 analyzer(source); } //抽取网页的源文件 private static String getSource(String link) { String charset = "GBK";//网页默认编码设置为GBK URLConnection connection = null; try { URL url = new URL(link); //打开连接 connection = url.openConnection(); //如果网页无法打开 if(null == connection) return null; //下载裸源文件 byte[] buf = new byte[2048]; InputStream is = null; ByteArrayOutputStream os = new ByteArrayOutputStream(); int count = 0; try { is = connection.getInputStream(); while ((count = is.read(buf)) >= 0) { os.write(buf, 0, count); } }catch (Exception e) { e.printStackTrace(); if (os.size() == 0) { return null; } } finally { try{is.close(); } catch(Exception e){} } //获取网页的编码格式 String content = os.toString(); int fromIndex = content.indexOf("charset="); charset = content.substring(fromIndex+8, content.indexOf("\"", fromIndex)); return new String(os.toByteArray(), charset); }catch (Exception e) { e.printStackTrace(); } return null; }} HttpClient框架比较容易.! 
给你俺以前写过的拿URL做的.!URL urlC = new URL(url); URLConnection connection = urlC.openConnection(); InputStream ips = connection.getInputStream(); FileOutputStream fos = new FileOutputStream(htmlFileName); challage(ips, fos); ips.close(); fos.close();private static void challage(InputStream ips, OutputStream ops) throws IOException { byte[] contents = new byte[1024]; int len = 0; while((len = ips.read(contents)) != -1){ ops.write(contents,0,len); } } URL urlC = new URL(url); URLConnection connection = urlC.openConnection(); InputStream ips = connection.getInputStream(); FileOutputStream fos = new FileOutputStream(htmlFileName); challage(ips, fos); ips.close(); fos.close();private static void challage(InputStream ips, OutputStream ops) throws IOException { byte[] contents = new byte[1024]; int len = 0; while((len = ips.read(contents)) != -1){ ops.write(contents,0,len); } }差不多就是这样吧。。 非常难!! 寻物启事:sql server 2005驱动包 JAVA不定参数探秘(想了解内部原理的都可以进来看一下) 为什么我觉得java比C/C++难 经典作品--Java解惑谜题连载③ 如何判断String中的一段是否是数字? 我的java队列如何让Delphi程序插数据? 怎样将整数变为String? 我女朋友老师布置的一道简单的作业!要求用java实现。 散分了!!!到底那个开发工具性价比最好,大家进来讨论!! 有个消费者线程退不出来,帮解决下 两个double数相乘,精度不准确,如何解决
http://lixinye0123.javaeye.com/blog/318299
这是别人的代码~
// Linked list that stores the related web page links.
private static List<String> list = new LinkedList<String>();
/**
 * Runs a Google news archive search for an event and hands the result page to
 * {@code analyzer}.
 *
 * <p>Results are requested sorted by relevance ({@code scoring=a}). Example of
 * the request built for the event name "躲猫猫" with 50 documents (shown before
 * percent-encoding):
 * {@code http://news.google.cn/archivesearch?q=躲猫猫&num=50&hl=zh-CN&ned=ccn&scoring=a}
 *
 * @param eventName       search keywords, passed in raw (NOT already URL-encoded);
 *                        they are percent-encoded as UTF-8 here so that non-ASCII
 *                        text, spaces and characters such as {@code &} cannot
 *                        corrupt the query string
 * @param documentsNumber number of results to request ({@code num} parameter)
 */
public static void downLoad(String eventName, int documentsNumber) {
    String encodedEvent;
    try {
        // String.valueOf keeps the previous behaviour of sending "null" for a null name.
        encodedEvent = java.net.URLEncoder.encode(String.valueOf(eventName), "UTF-8");
    } catch (java.io.UnsupportedEncodingException e) {
        // Every Java platform is required to support UTF-8, so this is unreachable in practice.
        throw new IllegalStateException("UTF-8 encoding is not available", e);
    }

    String url = "http://news.google.cn/archivesearch?q=" + encodedEvent
            + "&num=" + documentsNumber + "&hl=zh-CN&ned=ccn&scoring=a";

    // Source text of the result page; getSource returns null when the download fails.
    String source = getSource(url);

    // Extract the body text of each page.
    analyzer(source);
}
// Extracts the source of a web page.
/**
 * Downloads the resource at {@code link} and decodes it into text.
 *
 * <p>The character set is chosen in this order: the {@code charset=} parameter
 * of the response's Content-Type header, then the first {@code charset=}
 * declaration found in the downloaded bytes (for example in a {@code <meta>}
 * tag), and finally GBK when neither names a charset this JVM supports.
 *
 * <p>If reading fails part-way, whatever was downloaded so far is still decoded
 * and returned.
 *
 * @param link absolute URL of the page to fetch
 * @return the decoded page source, or {@code null} when the URL is invalid, the
 *         connection cannot be opened, nothing could be downloaded, or decoding fails
 */
private static String getSource(String link) {
    final String defaultCharset = "GBK"; // fallback page encoding
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    try {
        URL url = new URL(link);
        // Open the connection.
        URLConnection connection = url.openConnection();
        // The page cannot be opened.
        if (null == connection) {
            return null;
        }

        // Download the raw bytes.
        byte[] buf = new byte[2048];
        InputStream is = null;
        try {
            is = connection.getInputStream();
            int count;
            while ((count = is.read(buf)) >= 0) {
                os.write(buf, 0, count);
            }
        } catch (Exception e) {
            e.printStackTrace();
            if (os.size() == 0) {
                return null;
            }
            // Otherwise fall through and decode the partial download.
        } finally {
            // The stream is null when getInputStream() itself failed.
            if (is != null) {
                try {
                    is.close();
                } catch (Exception ignored) {
                    // Best effort: the data has already been read (or the read already failed).
                }
            }
        }

        byte[] raw = os.toByteArray();

        // Determine the page encoding: HTTP header first, then the document itself.
        String charset = findDeclaredCharset(connection.getContentType());
        if (charset == null) {
            // ISO-8859-1 maps every byte to one char, so the ASCII "charset=" marker
            // is found regardless of the real encoding or the platform default.
            charset = findDeclaredCharset(new String(raw, "ISO-8859-1"));
        }
        if (charset == null) {
            charset = defaultCharset;
        }
        return new String(raw, charset);
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}

/**
 * Returns the charset name that follows the first {@code charset=} in
 * {@code text} (case-insensitive, optionally quoted), or {@code null} when
 * there is none or the name is not a charset supported by this JVM.
 */
private static String findDeclaredCharset(String text) {
    if (text == null) {
        return null;
    }
    final String marker = "charset=";
    String lower = text.toLowerCase(java.util.Locale.ROOT);
    int from = lower.indexOf(marker);
    if (from < 0) {
        return null;
    }
    int start = from + marker.length();
    // Skip an opening quote, as in <meta charset="utf-8">.
    if (start < lower.length()
            && (lower.charAt(start) == '"' || lower.charAt(start) == '\'')) {
        start++;
    }
    // Consume only characters that are legal in a charset name.
    int end = start;
    while (end < lower.length()) {
        char c = lower.charAt(end);
        boolean legal = (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')
                || c == '-' || c == '_' || c == '.' || c == ':' || c == '+';
        if (!legal) {
            break;
        }
        end++;
    }
    String name = lower.substring(start, end);
    if (name.length() == 0) {
        return null;
    }
    try {
        return java.nio.charset.Charset.isSupported(name) ? name : null;
    } catch (IllegalArgumentException e) {
        // Syntactically illegal charset name (e.g. starts with '-').
        return null;
    }
}
}
// Download the page at `url` and save its raw bytes to the file `htmlFileName`.
// (`url` and `htmlFileName` are supplied by the surrounding code.)
URL urlC = new URL(url);
URLConnection connection = urlC.openConnection();
InputStream ips = connection.getInputStream();
try {
    FileOutputStream fos = new FileOutputStream(htmlFileName);
    try {
        challage(ips, fos);
    } finally {
        // Close the file even when the copy fails.
        fos.close();
    }
} finally {
    // Close the network stream even when opening or writing the file fails.
    ips.close();
}

/**
 * Copies every byte from {@code ips} to {@code ops} until end of stream.
 * Neither stream is closed by this method.
 *
 * @param ips stream to read from
 * @param ops stream to write to
 * @throws IOException if reading or writing fails
 */
private static void challage(InputStream ips, OutputStream ops) throws IOException {
    byte[] contents = new byte[1024];
    int len;
    while ((len = ips.read(contents)) != -1) {
        ops.write(contents, 0, len);
    }
}
// Download the page at `url` and save its raw bytes to the file `htmlFileName`.
// (`url` and `htmlFileName` are supplied by the surrounding code.)
URL urlC = new URL(url);
URLConnection connection = urlC.openConnection();
InputStream ips = connection.getInputStream();
try {
    FileOutputStream fos = new FileOutputStream(htmlFileName);
    try {
        challage(ips, fos);
    } finally {
        // Close the file even when the copy fails.
        fos.close();
    }
} finally {
    // Close the network stream even when opening or writing the file fails.
    ips.close();
}

/**
 * Copies every byte from {@code ips} to {@code ops} until end of stream.
 * Neither stream is closed by this method.
 *
 * @param ips stream to read from
 * @param ops stream to write to
 * @throws IOException if reading or writing fails
 */
private static void challage(InputStream ips, OutputStream ops) throws IOException {
    byte[] contents = new byte[1024];
    int len;
    while ((len = ips.read(contents)) != -1) {
        ops.write(contents, 0, len);
    }
}
差不多就是这样吧。。