/*
 * (c) Copyright IBM Corp. 1999 All rights reserved.
 *
 * This sample program is owned by International Business Machines Corporation or
 * one of its subsidiaries ("IBM") and is copyrighted and licensed, not sold.
 *
 * You may copy, modify, and distribute this sample program in any form without
 * payment to IBM, for any purpose including developing, using, marketing or
 * distributing programs that include or are derivative works of the sample program.
 *
 * The sample program is provided to you on an "AS IS" basis, without warranty of
 * any kind. IBM HEREBY EXPRESSLY DISCLAIMS ALL WARRANTIES, EITHER
 * EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE. Some jurisdictions do not allow for the exclusion or limitation of
 * implied warranties, so the above limitations or exclusions may not apply to you.
 * IBM shall not be liable for any damages you suffer as a result of using, modifying
 * or distributing the sample program or its derivatives.
 *
 * Each copy of any portion of this sample program or any derivative work, must
 * include the above copyright notice and disclaimer of warranty.
 */
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.lang.Exception;

/**
 * A miniature line-mode browser.
 *
 * <p>Connects to the URL given on the command line and writes the contents of
 * the page to standard output. It is also handy for testing servlets.
 *
 * <p>Usage: {@code java miniBrowse <URL> [raw|cooked]}
 *
 * <p>Raw mode dumps the HTML data stream unchanged. Cooked mode does some
 * rudimentary parsing. Cooked is the default, to keep screen clutter down.
 */
public class miniBrowse {
    /** URL of the page most recently requested; assigned by the constructor. */
    static URL addr = null;

    /** Output mode: "cooked" (case-insensitive) or anything else for raw. {@code null} means cooked. */
    static String method = null;

    /**
     * Fetches {@code pageURL} and writes it to standard output in the mode
     * selected by {@link #method}.
     *
     * <p>A malformed URL prints "Invalid URL." and terminates the JVM with
     * status 4. Any failure while connecting or reading is reported on standard
     * output; the constructor then returns normally.
     *
     * @param pageURL the address of the page to display
     */
    miniBrowse(String pageURL) {
        PrintWriter out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(
                System.out)));
        try {
            addr = new URL(pageURL);
        } catch (MalformedURLException e) {
            out.println("Invalid URL.");
            // System.exit does not flush our buffered writer, so do it first
            // or the message is never shown.
            out.flush();
            System.exit(4);
        }

        BufferedReader in = null;
        try {
            URLConnection page = addr.openConnection();
            in = new BufferedReader(
                    new InputStreamReader(page.getInputStream()));
            // A null mode (constructor used without main) falls back to the
            // documented default instead of failing.
            if (method == null || method.equalsIgnoreCase("cooked")) {
                cookedOutput(in, out);
            } else {
                rawOutput(in, out);
            }
        } catch (Exception e) {
            // Top-level boundary: keep the historical "never throw" behaviour,
            // but tell the user what went wrong instead of staying silent.
            out.println("Unable to read " + pageURL + ": " + e);
        } finally {
            out.flush();
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing the reader fails.
                }
            }
        }
    }

    /**
     * Renders HTML from {@code in} as plain text on {@code out}.
     *
     * <p>Everything before the BODY tag is skipped. A handful of tags are
     * translated (BR, HR, UL/OL/LI, A, IMG, APPLET, SCRIPT, P); all other tags
     * are dropped quietly. Runs of spaces are collapsed to a single space.
     *
     * <p>If the input ends inside a tag, "Missing '&gt;'" is printed and the
     * JVM terminates with status 8. An I/O error prints "I/O Exception.".
     * {@code out} is flushed in every case.
     *
     * @param in  source of the HTML text
     * @param out destination for the rendered text
     */
    public void cookedOutput(BufferedReader in, PrintWriter out) {
        boolean skip = true;   // true until the BODY tag has been seen
        String list = "";      // kind of the most recently opened list: "UL" or "OL"
        int lcount = 0;        // item counter for ordered lists
        int pchr = ' ';        // previously seen text character, for space collapsing

        try {
            int chr;
            while ((chr = in.read()) != -1) {
                if (chr == '<') {
                    /* found an HTML tag, let's extract it */
                    String tag = readTag(in);
                    if (tag == null) {
                        out.println("Missing '>'");
                        // Flush explicitly: System.exit skips the finally block.
                        out.flush();
                        System.exit(8);
                        return;
                    }

                    /*
                     * some basic tag processing (default is skip quietly)
                     * notice we skip the entire HEAD section!!
                     */
                    if (tag.startsWith("BODY")) {
                        skip = false;
                    }
                    if (tag.equals("BR")) {
                        out.println("");
                    }
                    if (tag.equals("HR")) {
                        out.println("\n-------------");
                    }
                    /*
                     * lists, ordered and unordered - new line and indent
                     * each member (ordered list numbers members)
                     */
                    if (tag.equals("UL") || tag.equals("OL")) {
                        out.println("");
                        list = tag;
                        lcount = 0;
                    }
                    if (tag.equals("LI")) {
                        if (list.equals("OL")) {
                            out.print("\n" + ++lcount);
                        } else {
                            out.print("\n-- ");
                        }
                    }
                    if (tag.startsWith("A ")) {
                        out.print("<Link>");
                    }
                    if (tag.startsWith("IMG")) {
                        out.print("<Image>");
                    }
                    if (tag.startsWith("APPLET")) {
                        out.print("<Java>");
                    }
                    if (tag.startsWith("SCRIPT")) {
                        out.print("<javascript>");
                    }
                    if (tag.equals("P")) {
                        out.println("\n\n");
                    }
                    continue;
                }

                if (!skip) {
                    /* don't echo excess spaces */
                    if (chr != ' ' || pchr != ' ') {
                        out.write(chr);
                    }
                    pchr = chr;
                }
            }
        } catch (IOException e) {
            out.println("I/O Exception.");
        } finally {
            // Flush on the error path too, so the message above is not lost.
            out.flush();
        }
    }

    /**
     * Reads the rest of a tag, up to but not including the closing '&gt;',
     * converting every character to upper case.
     *
     * @param in reader positioned just after the opening '&lt;'
     * @return the tag text, or {@code null} if the input ended before '&gt;'
     * @throws IOException if reading fails
     */
    private static String readTag(BufferedReader in) throws IOException {
        StringBuilder tag = new StringBuilder();
        int chr;
        while ((chr = in.read()) != '>') {
            if (chr == -1) {
                return null;
            }
            tag.append(Character.toUpperCase((char) chr));
        }
        return tag.toString();
    }

    /**
     * Copies {@code in} to {@code out} line by line without interpretation.
     * An I/O error prints "I/O Exception."; {@code out} is flushed in every case.
     *
     * @param in  source of the text
     * @param out destination for the text
     */
    public void rawOutput(BufferedReader in, PrintWriter out) {
        try {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                out.println(inputLine);
            }
        } catch (IOException e) {
            out.println("I/O Exception.");
        } finally {
            out.flush();
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the URL to fetch; optional {@code args[1]}
     *             selects the mode and defaults to "cooked"
     */
    public static void main(String[] args) {
        method = (args.length > 1) ? args[1] : "cooked";
        if (args.length == 0) {
            System.err.println("Usage: java miniBrowse <URL> [raw|cooked]");
            return;
        }
        new miniBrowse(args[0]);
    }
}
还有很多网页的标签不完整。
后来我在 sourceforge.net 上找到一个叫做 HtmlParser 的项目,这个包能完成 HTML 转换的很多操作,
包括我想要的功能;不过转换后的效果还不是很让人满意,没有 IE 保存后的效果好。
/* (c) Copyright IBM Corp. 1999 All rights reserved.
This sample program is owned by International Business Machines Corporation or
one of its subsidiaries ("IBM") and is copyrighted and licensed, not sold. You may copy, modify, and distribute this sample program in any form without
payment to IBM, for any purpose including developing, using, marketing or
distributing programs that include or are derivative works of the sample program. The sample program is provided to you on an "AS IS" basis, without warranty of
any kind. IBM HEREBY EXPRESSLY DISCLAIMS ALL WARRANTIES, EITHER
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. Some jurisdictions do not allow for the exclusion or limitation of
implied warranties, so the above limitations or exclusions may not apply to you.
IBM shall not be liable for any damages you suffer as a result of using, modifying
or distributing the sample program or its derivatives. Each copy of any portion of this sample program or any derivative work, must
include the above copyright notice and disclaimer of warranty.
*/import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.lang.Exception;

/**
 * A miniature line-mode browser.
 *
 * <p>Connects to the URL given on the command line and writes the contents of
 * the page to standard output. It is also handy for testing servlets.
 *
 * <p>Usage: {@code java miniBrowse <URL> [raw|cooked]}
 *
 * <p>Raw mode dumps the HTML data stream unchanged. Cooked mode does some
 * rudimentary parsing. Cooked is the default, to keep screen clutter down.
 */
public class miniBrowse {
    /** URL of the page most recently requested; assigned by the constructor. */
    static URL addr = null;

    /** Output mode: "cooked" (case-insensitive) or anything else for raw. {@code null} means cooked. */
    static String method = null;

    /**
     * Fetches {@code pageURL} and writes it to standard output in the mode
     * selected by {@link #method}.
     *
     * <p>A malformed URL prints "Invalid URL." and terminates the JVM with
     * status 4. Any failure while connecting or reading is reported on standard
     * output; the constructor then returns normally.
     *
     * @param pageURL the address of the page to display
     */
    miniBrowse(String pageURL) {
        PrintWriter out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(
                System.out)));
        try {
            addr = new URL(pageURL);
        } catch (MalformedURLException e) {
            out.println("Invalid URL.");
            // System.exit does not flush our buffered writer, so do it first
            // or the message is never shown.
            out.flush();
            System.exit(4);
        }

        BufferedReader in = null;
        try {
            URLConnection page = addr.openConnection();
            in = new BufferedReader(
                    new InputStreamReader(page.getInputStream()));
            // A null mode (constructor used without main) falls back to the
            // documented default instead of failing.
            if (method == null || method.equalsIgnoreCase("cooked")) {
                cookedOutput(in, out);
            } else {
                rawOutput(in, out);
            }
        } catch (Exception e) {
            // Top-level boundary: keep the historical "never throw" behaviour,
            // but tell the user what went wrong instead of staying silent.
            out.println("Unable to read " + pageURL + ": " + e);
        } finally {
            out.flush();
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing the reader fails.
                }
            }
        }
    }

    /**
     * Renders HTML from {@code in} as plain text on {@code out}.
     *
     * <p>Everything before the BODY tag is skipped. A handful of tags are
     * translated (BR, HR, UL/OL/LI, A, IMG, APPLET, SCRIPT, P); all other tags
     * are dropped quietly. Runs of spaces are collapsed to a single space.
     *
     * <p>If the input ends inside a tag, "Missing '&gt;'" is printed and the
     * JVM terminates with status 8. An I/O error prints "I/O Exception.".
     * {@code out} is flushed in every case.
     *
     * @param in  source of the HTML text
     * @param out destination for the rendered text
     */
    public void cookedOutput(BufferedReader in, PrintWriter out) {
        boolean skip = true;   // true until the BODY tag has been seen
        String list = "";      // kind of the most recently opened list: "UL" or "OL"
        int lcount = 0;        // item counter for ordered lists
        int pchr = ' ';        // previously seen text character, for space collapsing

        try {
            int chr;
            while ((chr = in.read()) != -1) {
                if (chr == '<') {
                    /* found an HTML tag, let's extract it */
                    String tag = readTag(in);
                    if (tag == null) {
                        out.println("Missing '>'");
                        // Flush explicitly: System.exit skips the finally block.
                        out.flush();
                        System.exit(8);
                        return;
                    }

                    /*
                     * some basic tag processing (default is skip quietly)
                     * notice we skip the entire HEAD section!!
                     */
                    if (tag.startsWith("BODY")) {
                        skip = false;
                    }
                    if (tag.equals("BR")) {
                        out.println("");
                    }
                    if (tag.equals("HR")) {
                        out.println("\n-------------");
                    }
                    /*
                     * lists, ordered and unordered - new line and indent
                     * each member (ordered list numbers members)
                     */
                    if (tag.equals("UL") || tag.equals("OL")) {
                        out.println("");
                        list = tag;
                        lcount = 0;
                    }
                    if (tag.equals("LI")) {
                        if (list.equals("OL")) {
                            out.print("\n" + ++lcount);
                        } else {
                            out.print("\n-- ");
                        }
                    }
                    if (tag.startsWith("A ")) {
                        out.print("<Link>");
                    }
                    if (tag.startsWith("IMG")) {
                        out.print("<Image>");
                    }
                    if (tag.startsWith("APPLET")) {
                        out.print("<Java>");
                    }
                    if (tag.startsWith("SCRIPT")) {
                        out.print("<javascript>");
                    }
                    if (tag.equals("P")) {
                        out.println("\n\n");
                    }
                    continue;
                }

                if (!skip) {
                    /* don't echo excess spaces */
                    if (chr != ' ' || pchr != ' ') {
                        out.write(chr);
                    }
                    pchr = chr;
                }
            }
        } catch (IOException e) {
            out.println("I/O Exception.");
        } finally {
            // Flush on the error path too, so the message above is not lost.
            out.flush();
        }
    }

    /**
     * Reads the rest of a tag, up to but not including the closing '&gt;',
     * converting every character to upper case.
     *
     * @param in reader positioned just after the opening '&lt;'
     * @return the tag text, or {@code null} if the input ended before '&gt;'
     * @throws IOException if reading fails
     */
    private static String readTag(BufferedReader in) throws IOException {
        StringBuilder tag = new StringBuilder();
        int chr;
        while ((chr = in.read()) != '>') {
            if (chr == -1) {
                return null;
            }
            tag.append(Character.toUpperCase((char) chr));
        }
        return tag.toString();
    }

    /**
     * Copies {@code in} to {@code out} line by line without interpretation.
     * An I/O error prints "I/O Exception."; {@code out} is flushed in every case.
     *
     * @param in  source of the text
     * @param out destination for the text
     */
    public void rawOutput(BufferedReader in, PrintWriter out) {
        try {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                out.println(inputLine);
            }
        } catch (IOException e) {
            out.println("I/O Exception.");
        } finally {
            out.flush();
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the URL to fetch; optional {@code args[1]}
     *             selects the mode and defaults to "cooked"
     */
    public static void main(String[] args) {
        method = (args.length > 1) ? args[1] : "cooked";
        if (args.length == 0) {
            System.err.println("Usage: java miniBrowse <URL> [raw|cooked]");
            return;
        }
        new miniBrowse(args[0]);
    }
}