import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Tianya "original poster only" reader: walks a forum thread page by page and
 * saves only the posts written by the thread's author to a local file.
 *
 * <p>Per the original author's note, this only targets the "Feeling" board and
 * parsing sometimes fails, because the page HTML is scraped with regular
 * expressions.
 *
 * <p>All state lives in mutable static fields with no synchronization.
 *
 * Date: 2010-6-5
 * Copyright: Copyright (c) 2010
 * @version 1.0
 */
public class TianYa {
    /** Number of the page being processed (starts at 1); only used for progress output. */
    static int currentPage = 1;
    /** Thread title; stays {@code null} until {@link #analysis} finds one. */
    static String titleName;
    /** Thread author; stays {@code null} until {@link #analysis} finds one. */
    static String autherName;

    /** Captures the value of the page's "Description" META tag, used as the title. */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("(?:<META\\s+NAME=\"Description\" CONTENT=\"([^\"]*))");
    /** Captures the value assigned to {@code chrAuthorName} in the page source. */
    private static final Pattern AUTHOR_PATTERN =
            Pattern.compile("(?:chrAuthorName\\s*=\\s*['\"])([^'\"]+)");
    /** Captures the href of the anchor whose label is the "next page" text. */
    private static final Pattern NEXT_PAGE_PATTERN =
            Pattern.compile("(?:<a\\s+href=)([^>]+)(?:><[^<]*下一页)");

    /**
     * Downloads a page and returns it as text.
     *
     * @param tar address of the page to fetch
     * @return the page body decoded as gb2312
     * @throws IOException if the address is malformed or the page cannot be read
     */
    static String getContent(String tar) throws IOException {
        InputStream is = new URL(tar).openStream();
        try {
            InputStreamReader reader = new InputStreamReader(is, "gb2312");
            char[] buffer = new char[1024];
            StringBuilder sb = new StringBuilder();
            int read;
            while ((read = reader.read(buffer)) != -1) {
                sb.append(buffer, 0, read);
            }
            return sb.toString();
        } finally {
            // Always release the connection, even when decoding or reading fails.
            is.close();
        }
    }

    /**
     * Writes every post by {@link #autherName} found in {@code content} to
     * {@code fos}, each followed by {@code <br>}, and reports the next page link.
     *
     * <p>When no author is known ({@code autherName} is {@code null}) nothing is
     * extracted; the next page link is still looked up.
     *
     * @param content page content
     * @param fos output file stream
     * @return the href of the "next page" link with surrounding quotes removed,
     *         or {@code null} when the page has no such link
     * @throws MalformedURLException declared for compatibility with existing callers
     * @throws IOException if writing to {@code fos} fails
     */
    static String readLZ(String content, FileOutputStream fos) throws MalformedURLException, IOException {
        if (autherName != null) {
            // Quote the author name so regex metacharacters in it are matched literally.
            Matcher posts = Pattern.compile(
                    "(?ms)(?:" + Pattern.quote(autherName) + "</a>.+?</table>)(.+?)(?><TABLE)").matcher(content);
            while (posts.find()) {
                // NOTE(review): getBytes() uses the platform default charset, so the
                // encoding of the saved file depends on the machine running this.
                fos.write(posts.group(1).getBytes());
                fos.write("<br>".getBytes());
            }
        }
        System.out.println("当前读取第" + (currentPage++) + "页");

        Matcher nextPage = NEXT_PAGE_PATTERN.matcher(content);
        if (!nextPage.find()) {
            return null;
        }
        return stripQuotes(nextPage.group(1));
    }

    /** Trims whitespace and removes one pair of matching single or double quotes around an attribute value. */
    private static String stripQuotes(String value) {
        String trimmed = value.trim();
        if (trimmed.length() >= 2) {
            char first = trimmed.charAt(0);
            char last = trimmed.charAt(trimmed.length() - 1);
            if ((first == '"' || first == '\'') && first == last) {
                return trimmed.substring(1, trimmed.length() - 1);
            }
        }
        return trimmed;
    }

    /**
     * Parses the page for the thread title and author, stores them in
     * {@link #titleName} and {@link #autherName}, and echoes both to standard
     * output and to {@code fos}. A field keeps its previous value when the
     * corresponding pattern does not match.
     *
     * @param content page content
     * @param fos output file stream
     * @throws IOException if writing to {@code fos} fails
     */
    static void analysis(String content, FileOutputStream fos) throws IOException {
        Matcher title = TITLE_PATTERN.matcher(content);
        if (title.find()) {
            titleName = title.group(1);
        }
        String titleLine = "标题:" + (titleName == null ? "未能读取标题" : titleName);
        System.out.println(titleLine);
        fos.write(titleLine.getBytes());

        Matcher auther = AUTHOR_PATTERN.matcher(content);
        if (auther.find()) {
            autherName = auther.group(1);
        }
        String authorLine = "作者:" + (autherName == null ? "作者读取失败" : autherName);
        System.out.println(authorLine);
        fos.write(authorLine.getBytes());
    }

    /**
     * Entry point. Optional arguments: {@code args[0]} is the address of the
     * thread's first page and {@code args[1]} is the output file; the original
     * hard-coded values are used when they are omitted.
     *
     * @param args optional start page address and output file path
     */
    public static void main(String[] args) {
        String startPage = (args != null && args.length > 0)
                ? args[0]
                : "http://www.tianya.cn/publicforum/Content/feeling/1/1210885.shtml";
        String saveFile = (args != null && args.length > 1) ? args[1] : "E:\\mytianya.html";
        boolean isAppend = false; // overwrite rather than append to the output file
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(saveFile, isAppend);
            String pageUrl = startPage;
            String content = getContent(pageUrl);
            analysis(content, fos);
            // Without an author there is nothing to filter on, so skip the crawl;
            // analysis has already reported the failure.
            if (autherName != null) {
                String nextPage = readLZ(content, fos);
                while (nextPage != null) {
                    // Resolve against the current page so relative links work too.
                    pageUrl = new URL(new URL(pageUrl), nextPage).toString();
                    nextPage = readLZ(getContent(pageUrl), fos);
                }
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException and MalformedURLException.
            e.printStackTrace();
        } finally {
            if (fos != null) {
                try {
                    fos.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        System.out.println("读取完毕!");
    }
}
睡不着.发段代码,帮忙看下,哪里的代码习惯不好
解决方案 »
免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货