/*
* Created on 2003/09/03
*
* To change the template for this generated file go to
* Window>Preferences>Java>Code Generation>Code and Comments
*/
package com.tide.html2jsp;import java.io.*;
import java.util.*;
import java.net.*;import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.HTML.Tag;/**
* @author wx
*
* To change the template for this generated type comment go to
* Window>Preferences>Java>Code Generation>Code and Comments
*/public class AnalyHtmlDoc { public static void main(String[] args) {
HTMLDocument doc = new HTMLDocument();
FileInputStream stream = null;
List result = new ArrayList();
try {
//ファイルを読む
Reader rd =
new InputStreamReader(new FileInputStream("GCM3-0103.html"));
StringBuffer bf = new StringBuffer();
// Parse the HTML
EditorKit kit = new HTMLEditorKit();
doc = (HTMLDocument) kit.createDefaultDocument();
kit.read(rd, doc, 0);
// Find all the A elements in the HTML document Tag[] tags = HTML.getAllTags();
for (int i = 0; i < tags.length; i++) {
HTMLDocument.Iterator it = doc.getIterator(tags[i]);
System.out.println("----"+tags[i]);
if (it != null){
while (it.isValid()) {
System.out.println("---------------------------"+tags[i]);
SimpleAttributeSet s =
(SimpleAttributeSet) it.getAttributes(); if (s != null) {
String link =
(String) s.getAttribute(HTML.Attribute.TYPE);
if (link != null) {
// Add the link to the result list
result.add(link);
System.out.println(link); }
}
it.next();
}
} }
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (BadLocationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} }
public static String getText(String uriStr) {
final StringBuffer buf = new StringBuffer(1000); try {
// Create an HTML document that appends all text to buf
HTMLDocument doc = new HTMLDocument() {
public HTMLEditorKit.ParserCallback getReader(int pos) {
return new HTMLEditorKit.ParserCallback() {
// This method is whenever text is encountered in the HTML file
public void handleText(char[] data, int pos) {
buf.append(data);
buf.append('\n');
}
};
}
}; // Create a reader on the HTML content
URL url = new URI(uriStr).toURL();
URLConnection conn = url.openConnection();
Reader rd = new InputStreamReader(conn.getInputStream()); // Parse the HTML
EditorKit kit = new HTMLEditorKit();
kit.read(rd, doc, 0);
} catch (MalformedURLException e) {
} catch (URISyntaxException e) {
} catch (BadLocationException e) {
} catch (IOException e) {
} // Return the text
return buf.toString();
} public static String[] getLinks(String uriStr) {
List result = new ArrayList(); try {
// Create a reader on the HTML content
URL url = new URI(uriStr).toURL();
URLConnection conn = url.openConnection();
Reader rd = new InputStreamReader(conn.getInputStream()); // Parse the HTML
EditorKit kit = new HTMLEditorKit();
HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
kit.read(rd, doc, 0); // Find all the A elements in the HTML document
HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
while (it.isValid()) {
SimpleAttributeSet s = (SimpleAttributeSet) it.getAttributes(); String link = (String) s.getAttribute(HTML.Attribute.HREF);
if (link != null) {
// Add the link to the result list
result.add(link);
}
it.next();
}
} catch (MalformedURLException e) {
} catch (URISyntaxException e) {
} catch (BadLocationException e) {
} catch (IOException e) {
} // Return all found links
return (String[]) result.toArray(new String[result.size()]);
}
}
* Created on 2003/09/03
*
* To change the template for this generated file go to
* Window>Preferences>Java>Code Generation>Code and Comments
*/
package com.tide.html2jsp;import java.io.*;
import java.util.*;
import java.net.*;import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.HTML.Tag;/**
* @author wx
*
* To change the template for this generated type comment go to
* Window>Preferences>Java>Code Generation>Code and Comments
*/public class AnalyHtmlDoc { public static void main(String[] args) {
HTMLDocument doc = new HTMLDocument();
FileInputStream stream = null;
List result = new ArrayList();
try {
//ファイルを読む
Reader rd =
new InputStreamReader(new FileInputStream("GCM3-0103.html"));
StringBuffer bf = new StringBuffer();
// Parse the HTML
EditorKit kit = new HTMLEditorKit();
doc = (HTMLDocument) kit.createDefaultDocument();
kit.read(rd, doc, 0);
// Find all the A elements in the HTML document Tag[] tags = HTML.getAllTags();
for (int i = 0; i < tags.length; i++) {
HTMLDocument.Iterator it = doc.getIterator(tags[i]);
System.out.println("----"+tags[i]);
if (it != null){
while (it.isValid()) {
System.out.println("---------------------------"+tags[i]);
SimpleAttributeSet s =
(SimpleAttributeSet) it.getAttributes(); if (s != null) {
String link =
(String) s.getAttribute(HTML.Attribute.TYPE);
if (link != null) {
// Add the link to the result list
result.add(link);
System.out.println(link); }
}
it.next();
}
} }
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (BadLocationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} }
public static String getText(String uriStr) {
final StringBuffer buf = new StringBuffer(1000); try {
// Create an HTML document that appends all text to buf
HTMLDocument doc = new HTMLDocument() {
public HTMLEditorKit.ParserCallback getReader(int pos) {
return new HTMLEditorKit.ParserCallback() {
// This method is whenever text is encountered in the HTML file
public void handleText(char[] data, int pos) {
buf.append(data);
buf.append('\n');
}
};
}
}; // Create a reader on the HTML content
URL url = new URI(uriStr).toURL();
URLConnection conn = url.openConnection();
Reader rd = new InputStreamReader(conn.getInputStream()); // Parse the HTML
EditorKit kit = new HTMLEditorKit();
kit.read(rd, doc, 0);
} catch (MalformedURLException e) {
} catch (URISyntaxException e) {
} catch (BadLocationException e) {
} catch (IOException e) {
} // Return the text
return buf.toString();
} public static String[] getLinks(String uriStr) {
List result = new ArrayList(); try {
// Create a reader on the HTML content
URL url = new URI(uriStr).toURL();
URLConnection conn = url.openConnection();
Reader rd = new InputStreamReader(conn.getInputStream()); // Parse the HTML
EditorKit kit = new HTMLEditorKit();
HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
kit.read(rd, doc, 0); // Find all the A elements in the HTML document
HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
while (it.isValid()) {
SimpleAttributeSet s = (SimpleAttributeSet) it.getAttributes(); String link = (String) s.getAttribute(HTML.Attribute.HREF);
if (link != null) {
// Add the link to the result list
result.add(link);
}
it.next();
}
} catch (MalformedURLException e) {
} catch (URISyntaxException e) {
} catch (BadLocationException e) {
} catch (IOException e) {
} // Return all found links
return (String[]) result.toArray(new String[result.size()]);
}
}
javax.swing.text.ChangedCharSetException
.......
.......好像是字符集的问题,有我加上去的那段代码就可以把页面中的所有标记打印出来,还有一个问题,如果把输入流改为,url.openStream();,程序没有任何反应也没有异常输出。
new InputStreamReader(new URL("http://www.yahoo.com").openStream());/*
* Created on 2003/09/03
*/import java.io.*;
import java.util.*;
import java.net.*;import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.HTML.Tag;/**
* @author wx
*/public class AnalyHtmlDoc { public static void main(String[] args) {
HTMLDocument doc = new HTMLDocument();
FileInputStream stream = null;
List result = new ArrayList();
try {
//ファイルを読む
Reader rd =
new InputStreamReader(new FileInputStream("c:\\yahoo.htm"));
//new URL("http://www.yahoo.com").openStream());
StringBuffer bf = new StringBuffer();
// Parse the HTML
EditorKit kit = new HTMLEditorKit();
doc = (HTMLDocument) kit.createDefaultDocument(); //我加上去的代码
BufferedReader brd = new BufferedReader(rd);
String a = "";
while ((a = brd.readLine()) != null) {
//System.out.println(a);
}
//附加代码结束 kit.read(rd, doc, 0);
// Find all the A elements in the HTML document Tag[] tags = HTML.getAllTags();
for (int i = 0; i < tags.length; i++) {
HTMLDocument.Iterator it = doc.getIterator(tags[i]);
System.out.println("----" + tags[i]);
if (it != null) {
while (it.isValid()) {
System.out.println(
"---------------------------" + tags[i]);
SimpleAttributeSet s =
(SimpleAttributeSet) it.getAttributes(); if (s != null) {
String link =
(String) s.getAttribute(HTML.Attribute.TYPE);
if (link != null) {
// Add the link to the result list
result.add(link);
System.out.println(link); }
}
it.next();
}
} }
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
e.printStackTrace();
} catch (BadLocationException e) {
e.printStackTrace();
} }
public static String getText(String uriStr) {
final StringBuffer buf = new StringBuffer(1000); try {
// Create an HTML document that appends all text to buf
HTMLDocument doc = new HTMLDocument() {
public HTMLEditorKit.ParserCallback getReader(int pos) {
return new HTMLEditorKit.ParserCallback() {
// This method is whenever text is encountered in the HTML file
public void handleText(char[] data, int pos) {
buf.append(data);
buf.append('\n');
}
};
}
}; // Create a reader on the HTML content
URL url = new URI(uriStr).toURL();
URLConnection conn = url.openConnection();
Reader rd = new InputStreamReader(conn.getInputStream()); // Parse the HTML
EditorKit kit = new HTMLEditorKit();
kit.read(rd, doc, 0);
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (URISyntaxException e) {
e.printStackTrace();
} catch (BadLocationException e) {
e.printStackTrace();
} catch (IOException e) {
} // Return the text
return buf.toString();
} public static String[] getLinks(String uriStr) {
List result = new ArrayList(); try {
// Create a reader on the HTML content
URL url = new URI(uriStr).toURL();
URLConnection conn = url.openConnection();
Reader rd = new InputStreamReader(conn.getInputStream()); // Parse the HTML
EditorKit kit = new HTMLEditorKit();
HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
kit.read(rd, doc, 0); // Find all the A elements in the HTML document
HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
while (it.isValid()) {
SimpleAttributeSet s = (SimpleAttributeSet) it.getAttributes(); String link = (String) s.getAttribute(HTML.Attribute.HREF);
if (link != null) {
// Add the link to the result list
result.add(link);
}
it.next();
}
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (URISyntaxException e) {
e.printStackTrace();
} catch (BadLocationException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} // Return all found links
return (String[]) result.toArray(new String[result.size()]);
}
}
不过去掉metadata中的编码设置就没有问题了
不知道我加上去的那一段对它有什么影响,只是一种现象
刚刚看了你的贴子,重新去搜索,终于找到答案URL url = new URL("http://www.sina.com.cn/");
HTMLDocument doc = new HTMLDocument();
doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
HTMLEditorKit kit = new HTMLEditorKit();
InputStreamReader in = new InputStreamReader(url.openStream());
kit.read(in, doc, 0);
in.close();
這個方法