private String encoding = "utf-8"; private URL url; private String pageName; //The original name in the server side private String targetName ; //The name you want when you save the page as a files private String content; private List<String> linkedList = new ArrayList(); private List<Image> imageList = new ArrayList();
//Also can add css,javascript support in the furture void init(){ System.out.println("Begin Init..."); initContent (); initPageName(); initLinks(); initImages(); System.out.println("End Init..."); } //Constructors public Page(URL url) { this.url = url; init(); } public Page(String url) { try { this.url = new URL(url); init(); } catch (MalformedURLException ex) { System.out.println("URL is invalid"); } } public Page(String url,String encoding){ this.encoding = encoding; try { this.url = new URL(url); init(); } catch (MalformedURLException ex) { System.out.println("URL is invalid"); } }//All Public method public String getEncoding() { return encoding; } public void setEncoding(String encoding) { this.encoding = encoding; } public String getContent() { return content; } public URL getUrl() { return url; } public String getPageName() { return pageName; } public List<Image> getImageList() { return imageList; } public List<String> getLinkedList() { return linkedList; }
public void SaveAsFile(String path) throws Exception { System.out.println("Begin Save as a file... :" + targetName); //创建目标目录 File fpath = new File(path); fpath.mkdirs();
SaveAllImages(path); //这里要从CONTENT中输出文件,因为这样才可以更改内容 BufferedReader in = new BufferedReader(new StringReader(content)); try { PrintWriter out = new PrintWriter( new BufferedWriter( new FileWriter(path+ File.separator + targetName))); String inputLine; while ((inputLine = in.readLine()) != null) { String strLine = System.getProperty("line.separator"); out.println(inputLine + strLine); } out.close(); } catch (EOFException e) { System.err.println("End of stream save as a file:"+targetName); } in.close(); System.out.println("Save File Success! On :" + path); }
//All private method private void initContent(){ try { StringBuffer sb = new StringBuffer(); //As a container to contain the content URLConnection conn = url.openConnection(); conn.setDoOutput(true); //我不知道这里要设定什么编码了,郁闷(BIG5反而正常了?) BufferedReader in = new BufferedReader( new InputStreamReader(conn.getInputStream(), encoding)); String inputLine; while ((inputLine = in.readLine()) != null) { sb.append(inputLine); sb.append("\n"); // System.out.println(inputLine); } in.close(); content = sb.toString(); System.out.println("End of initContent!"); } catch (IOException ex) { System.out.println("Can't connect to the URL!"); } } private void initPageName(){ String strUrl = url.getFile(); if ( strUrl.endsWith(".html")|| strUrl.endsWith(".htm")){ int i = strUrl.lastIndexOf("/"); pageName = strUrl.substring(i+1); }else{ pageName = "index.html"; } targetName = pageName; System.out.println( "End of initPageName!The name is :"+targetName); } private void initImages(){ try { Parser p = new Parser(content); p.setEncoding(encoding); String filterType = "img"; NodeFilter filer = new TagNameFilter(filterType); NodeList nl = p.extractAllNodesThatMatch(filer); for (int i = 0; i < nl.size(); i++) { ImageTag imageTag = (ImageTag) nl.elementAt(i); String imageUrl = imageTag.getImageURL(); System.out.println("Image-"+i+" : "+imageUrl); imageList.add(new Image(imageUrl)); } } catch (ParserException ex) { System.out.println("Parse Image Tag Error!"); } System.out.println("End of initImage!"); } private void initLinks(){ try { //Parser html source code to get all linked list in Parser parser; NodeList nodelist; parser = Parser.createParser(content, encoding); NodeFilter linkFilter = new NodeClassFilter(LinkTag.class); OrFilter lastFilter = new OrFilter(); lastFilter.setPredicates(new NodeFilter[]{linkFilter}); nodelist = parser.parse(lastFilter); Node[] nodes = nodelist.toNodeArray(); String line = ""; for (int i = 0; i < nodes.length; i++) { Node node = nodes[i]; LinkTag link = (LinkTag) 
node; line = link.getLink(); if (isTrimEmpty(line)) { continue; } if (!line.startsWith("http://")&&!line.equals("#")&&!line.equals("index.html")&&(line.endsWith(".html")||line.endsWith(".htm"))) { String sUrl = url.toURI().toString(); line = sUrl.substring(0, sUrl.length()-pageName.length())+line; System.out.println("Link-"+i+" : "+line); linkedList.add(line); } } } catch (URISyntaxException ex) { System.out.println("Can't change the URL to URI!"); } catch (ParserException ex) { System.out.println("Parse Links Tag Error!"); } System.out.println("End of initLinks!"); } private boolean isTrimEmpty(String astr) { if ((null == astr) || (astr.length() == 0)) { return true; } if (isBlank(astr.trim())) { return true; } return false; } private boolean isBlank(String astr) { if ((null == astr) || (astr.length() == 0)) { return true; } else { return false; } } }
3、使用以上类的类(有准备写个界面)
package pagecapture;

import java.util.List;

/**
 * Command-line entry point: downloads one index page and every page it links to.
 *
 * <p>Usage: {@code Main [url [destDir [encoding]]]}. Any argument that is omitted
 * falls back to the value that used to be hard-coded.
 */
public class Main {

    /** Index page downloaded when no URL argument is given. */
    private static final String DEFAULT_URL =
            "http://www.510book.cn/files/article/html/3/3382/index.html";

    /** Directory the pages are saved to when no destination argument is given. */
    private static final String DEFAULT_DEST = "D:/PageCapture";

    /** Encoding used to read the index page when no encoding argument is given. */
    private static final String DEFAULT_ENCODING = "BIG5";

    /**
     * Saves the index page, then saves each page collected in its link list.
     *
     * @param args optional: {@code args[0]} index URL, {@code args[1]} destination
     *             directory, {@code args[2]} encoding of the index page
     * @throws Exception if the index page itself cannot be saved
     */
    public static void main(String[] args) throws Exception {
        String sUrl = args.length > 0 ? args[0] : DEFAULT_URL;
        String dest = args.length > 1 ? args[1] : DEFAULT_DEST;
        String encoding = args.length > 2 ? args[2] : DEFAULT_ENCODING;

        Page p = new Page(sUrl, encoding);
        p.SaveAsFile(dest);

        List<String> links = p.getLinkedList();
        for (String s : links) {
            System.out.println("");
            System.out.println("-------------------" + s + "----------------");
            try {
                // NOTE(review): linked pages are built with the one-argument constructor,
                // so they are read with Page's default encoding, not the index page's
                // encoding -- confirm this is intended.
                Page d = new Page(s);
                d.SaveAsFile(dest);
            } catch (Exception ex) {
                // One broken link must not abort the remaining downloads.
                System.err.println("Skip page " + s + " : " + ex);
            }
        }
    }
}
比较粗糙,不对应该是十分粗糙,我主要就用它下下起点的盗贴(如果是小说迷,又只能在公司上网的话一定有一样的需求)
1、保存图片的类
package pagecapture;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

/**
 * An image referenced by a page, identified by its URL, that can be downloaded
 * into an {@code images} sub-directory of a target directory.
 *
 * <p>If the URL string given to the constructor is malformed, the instance is still
 * created but {@link #getUrl()} and {@link #getTargetName()} return {@code null}
 * and {@link #saveAsFile(String)} does nothing.
 */
public class Image {

    /** Name of the sub-directory images are stored in. */
    private static final String IMAGE_DIR = "images";

    /** Size in bytes of the copy buffer. */
    private static final int BUFFER_SIZE = 8096;

    private URL url;
    private String imageName;  // image name on the server side
    private String targetName; // image name used when saved locally

    /**
     * @return the local file name, or {@code null} if the URL was invalid
     */
    public String getTargetName() {
        return targetName;
    }

    /**
     * @return the image URL, or {@code null} if the URL was invalid
     */
    public URL getUrl() {
        return url;
    }

    /**
     * Creates an image for the given URL and derives its local file name.
     * A malformed URL is reported on standard output and leaves the instance
     * without URL and names.
     *
     * @param url absolute URL of the image
     */
    public Image(String url) {
        try {
            this.url = new URL(url);
            initImageName();
            targetName = imageName;
        } catch (MalformedURLException ex) {
            System.out.println("URL is invalid");
        }
    }

    /**
     * Downloads the image to {@code <path>/images/<targetName>}.
     *
     * <p>Nothing is downloaded when the target file already exists or when this
     * image has no valid URL. IO failures are reported on standard error; a
     * partially written file is removed so that a later call can retry.
     *
     * @param path target directory; {@code null} or empty means the current
     *             working directory
     */
    public void saveAsFile(String path) {
        if (url == null) {
            // The constructor swallowed a malformed URL; there is nothing to fetch.
            System.err.println("Skip image: URL is invalid");
            return;
        }

        // Resolve and create the images directory.
        File dir;
        if (path == null || "".equals(path)) {
            dir = new File(IMAGE_DIR);
        } else {
            dir = new File(path + File.separator + IMAGE_DIR);
        }
        if (!dir.exists()) {
            dir.mkdirs();
        }

        String sFile = dir.getPath() + File.separator + targetName;
        File file = new File(sFile);
        if (file.exists()) {
            System.out.println("File : "+ sFile +" has exists!");
            return;
        }

        BufferedInputStream in = null;
        FileOutputStream out = null;
        boolean saved = false;
        try {
            URLConnection conn = url.openConnection();
            in = new BufferedInputStream(conn.getInputStream());
            out = new FileOutputStream(file);
            byte[] buf = new byte[BUFFER_SIZE];
            int size;
            while ((size = in.read(buf)) != -1) {
                out.write(buf, 0, size);
            }
            saved = true;
        } catch (IOException ex) {
            System.err.println("IO Error about:" + sFile);
        } finally {
            closeQuietly(out);
            closeQuietly(in);
        }

        if (saved) {
            System.out.println("Save Image OK : " +sFile);
        } else {
            // A truncated file would make every later call return at the exists() check.
            file.delete();
        }
    }

    /** Closes a stream, ignoring {@code null} and reporting (not throwing) failures. */
    private static void closeQuietly(java.io.Closeable stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ex) {
            System.err.println("IO Error while closing stream: " + ex.getMessage());
        }
    }

    /**
     * Derives {@code imageName} from the part of the URL's file component after
     * the last '/'. Names ending in ".asp" or ".jsp" get that suffix replaced
     * by ".gif".
     */
    private void initImageName() {
        String file = url.getFile();
        String name = file.substring(file.lastIndexOf("/") + 1);
        if (name.endsWith(".asp") || name.endsWith(".jsp")) {
            name = name.substring(0, name.length() - ".asp".length()) + ".gif";
        }
        imageName = name;
    }
}
package pagecapture;
import java.io.*;
import java.net.*;
import java.util.ArrayList;
import java.util.List;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * An HTML page downloaded from a URL.
 *
 * <p>On construction the page content is fetched and decoded with the configured
 * encoding, a local file name is derived, and the links and images found in the
 * content are collected. {@link #SaveAsFile(String)} then writes the page and its
 * images to a directory.
 *
 * <p>Failures during construction (invalid URL, download error, parse error) are
 * reported on standard output and leave the corresponding fields unset or empty.
 */
public class Page {

    /** Relative directory image references are rewritten to; Image saves into "images". */
    private static final String IMAGE_DIR = "images/";

    /** File name used when the URL does not end in ".html" or ".htm". */
    private static final String DEFAULT_PAGE_NAME = "index.html";

    private String encoding = "utf-8";
    private URL url;
    private String pageName;   // the original name on the server side
    private String targetName; // the name used when the page is saved as a file
    private String content;
    private List<String> linkedList = new ArrayList<String>();
    private List<Image> imageList = new ArrayList<Image>();
    // CSS and JavaScript support could also be added in the future.

    /**
     * Downloads the page and extracts its name, links and images.
     * Link and image extraction are skipped when the download failed.
     */
    void init() {
        System.out.println("Begin Init...");
        if (url == null) {
            System.out.println("URL is invalid");
        } else {
            initContent();
            initPageName();
            if (content != null) {
                initLinks();
                initImages();
            }
        }
        System.out.println("End Init...");
    }

    // Constructors

    /**
     * Loads the page at the given URL using the default encoding.
     *
     * @param url location of the page
     */
    public Page(URL url) {
        this.url = url;
        init();
    }

    /**
     * Loads the page at the given URL using the default encoding.
     * A malformed URL is reported on standard output and nothing is loaded.
     *
     * @param url location of the page
     */
    public Page(String url) {
        try {
            this.url = new URL(url);
            init();
        } catch (MalformedURLException ex) {
            System.out.println("URL is invalid");
        }
    }

    /**
     * Loads the page at the given URL, decoding it with the given encoding.
     * A malformed URL is reported on standard output and nothing is loaded.
     *
     * @param url      location of the page
     * @param encoding charset name used to decode the downloaded bytes
     */
    public Page(String url, String encoding) {
        this.encoding = encoding;
        try {
            this.url = new URL(url);
            init();
        } catch (MalformedURLException ex) {
            System.out.println("URL is invalid");
        }
    }

    // All public methods

    /** @return the charset name used to decode and to save the page */
    public String getEncoding() {
        return encoding;
    }

    /** @param encoding charset name used by later calls to {@link #SaveAsFile(String)} */
    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    /** @return the page content, or {@code null} if it could not be downloaded */
    public String getContent() {
        return content;
    }

    /** @return the page URL, or {@code null} if the URL string was malformed */
    public URL getUrl() {
        return url;
    }

    /** @return the page's file name on the server side */
    public String getPageName() {
        return pageName;
    }

    /** @return the images found in the page */
    public List<Image> getImageList() {
        return imageList;
    }

    /** @return the absolute URLs of the relative HTML links found in the page */
    public List<String> getLinkedList() {
        return linkedList;
    }

    /**
     * Saves the page's images and then the page itself into the given directory.
     * Image references in the content are rewritten to the local copies first.
     *
     * @param path target directory; created if missing
     * @throws Exception if the page has no content or the file cannot be written
     */
    public void SaveAsFile(String path) throws Exception {
        System.out.println("Begin Save as a file... :" + targetName);
        if (content == null) {
            throw new IOException("Nothing to save, page content was not downloaded: " + url);
        }

        // Create the target directory.
        File fpath = new File(path);
        fpath.mkdirs();
        saveAllImages(path);

        // The file is written from content (not re-downloaded) because content
        // carries the rewritten image paths.
        File target = new File(path + File.separator + targetName);
        FileOutputStream fos = new FileOutputStream(target);
        try {
            // Encode with the page's own encoding; FileWriter would use the platform default.
            PrintWriter out = new PrintWriter(
                    new BufferedWriter(new OutputStreamWriter(fos, encoding)));
            BufferedReader in = new BufferedReader(new StringReader(content));
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                // println already appends the line separator.
                out.println(inputLine);
            }
            // PrintWriter never throws on write errors; checkError flushes and reports them.
            if (out.checkError()) {
                throw new IOException("Failed to write " + target);
            }
        } finally {
            fos.close();
        }
        System.out.println("Save File Success! On :" + path);
    }

    /**
     * Downloads every image and replaces its absolute URL inside the content with
     * the relative path of the downloaded copy.
     */
    private void saveAllImages(String path) {
        for (Image image : imageList) {
            if (image.getUrl() == null) {
                // Image keeps a null URL when the src was not a valid absolute URL.
                continue;
            }
            image.saveAsFile(path);
            String origSrc = image.getUrl().toString();
            String nowSrc = IMAGE_DIR + image.getTargetName();
            // Literal replacement: a URL is not a regular expression.
            content = content.replace(origSrc, nowSrc);
        }
    }

    // All private methods

    /**
     * Downloads the page and stores it, decoded with {@code encoding}, in
     * {@code content}. On failure a message is printed and content is left unset.
     */
    private void initContent() {
        BufferedReader in = null;
        try {
            StringBuilder sb = new StringBuilder(); // accumulates the page text
            URLConnection conn = url.openConnection();
            // The encoding is whatever the caller configured; it is not detected
            // from the response.
            in = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), encoding));
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                sb.append(inputLine);
                sb.append("\n");
            }
            content = sb.toString();
            System.out.println("End of initContent!");
        } catch (IOException ex) {
            System.out.println("Can't connect to the URL!");
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing the reader fails.
                }
            }
        }
    }

    /**
     * Derives the page name from the URL: the last path segment when the URL ends
     * in ".html" or ".htm", otherwise "index.html". The target name starts out equal.
     */
    private void initPageName() {
        String strUrl = url.getFile();
        if (strUrl.endsWith(".html") || strUrl.endsWith(".htm")) {
            pageName = strUrl.substring(strUrl.lastIndexOf("/") + 1);
        } else {
            pageName = DEFAULT_PAGE_NAME;
        }
        targetName = pageName;
        System.out.println( "End of initPageName!The name is :"+targetName);
    }

    /** Collects an Image for every img tag found in the content. */
    private void initImages() {
        try {
            Parser p = new Parser(content);
            p.setEncoding(encoding);
            NodeFilter imgFilter = new TagNameFilter("img");
            NodeList nl = p.extractAllNodesThatMatch(imgFilter);
            for (int i = 0; i < nl.size(); i++) {
                ImageTag imageTag = (ImageTag) nl.elementAt(i);
                String imageUrl = imageTag.getImageURL();
                System.out.println("Image-"+i+" : "+imageUrl);
                imageList.add(new Image(imageUrl));
            }
        } catch (ParserException ex) {
            System.out.println("Parse Image Tag Error!");
        }
        System.out.println("End of initImage!");
    }

    /**
     * Collects the relative HTML links of the page, resolved against the page URL.
     */
    private void initLinks() {
        try {
            // Parse the HTML source and keep only link tags.
            Parser parser = Parser.createParser(content, encoding);
            NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
            OrFilter lastFilter = new OrFilter();
            lastFilter.setPredicates(new NodeFilter[]{linkFilter});
            NodeList nodelist = parser.parse(lastFilter);
            Node[] nodes = nodelist.toNodeArray();
            for (int i = 0; i < nodes.length; i++) {
                String line = ((LinkTag) nodes[i]).getLink();
                if (isTrimEmpty(line) || !isRelativeHtmlLink(line)) {
                    continue;
                }
                try {
                    // Resolve against the page URL instead of cutting the page
                    // name off the URL string by length.
                    String resolved = new URL(url, line).toString();
                    System.out.println("Link-"+i+" : "+resolved);
                    linkedList.add(resolved);
                } catch (MalformedURLException ex) {
                    System.out.println("Can't resolve the link : " + line);
                }
            }
        } catch (ParserException ex) {
            System.out.println("Parse Links Tag Error!");
        }
        System.out.println("End of initLinks!");
    }

    /**
     * Tells whether a link should be followed: it must end in ".html" or ".htm",
     * must not be absolute (http/https), and must not be "#" or "index.html".
     */
    private boolean isRelativeHtmlLink(String link) {
        if (link.startsWith("http://") || link.startsWith("https://")) {
            return false;
        }
        if (link.equals("#") || link.equals("index.html")) {
            return false;
        }
        return link.endsWith(".html") || link.endsWith(".htm");
    }

    /** @return true if the string is null, empty, or only whitespace */
    private boolean isTrimEmpty(String astr) {
        return astr == null || astr.trim().length() == 0;
    }
}
// NOTE(review): the enclosing class and method headers are not visible in this
// fragment; these statements are only annotated, not changed.
// Page to start from and the local directory passed to SaveAsFile.
String sUrl = "http://www.510book.cn/files/article/html/3/3382/index.html";
String dest = "D:/PageCapture";
// Load the start page, decoding it as BIG5, and save it under dest.
Page p = new Page(sUrl,"BIG5");
p.SaveAsFile(dest);
// Load and save every link collected from the start page.
List<String> l = p.getLinkedList();
for (String s : l) {
System.out.println("");
System.out.println("-------------------" + s + "----------------");
// NOTE(review): the one-argument constructor leaves Page's encoding at its
// default ("utf-8"), unlike the BIG5 used for the start page -- confirm intended.
Page d = new Page(s);
d.SaveAsFile(dest);
}
}
}
比较粗糙,不对,应该是十分粗糙。我主要就用它下下起点的盗贴(如果是小说迷,又只能在公司上网的话,一定有一样的需求)。