【精品问题】有关JDK1.4中有HTMLDocument用来解析Html文档的

import java.net.*;
import java.io.*;public class GetHTML {
public static void main(String args[]){
if (args.length < 1){
System.out.println("USAGE: java GetHTML httpaddress");
System.exit(1);
}
String sURLAddress = new String(args[0]);
URL    url = null;
try{
   url = new URL(sURLAddress);
}catch(MalformedURLException e){
   System.err.println(e.toString());
                   System.exit(1);
}
try{
                   InputStream ins = url.openStream();
   BufferedReader breader = new BufferedReader(new InputStreamReader(ins));
                   String info = breader.readLine();
                   while(info != null){
                        System.out.println(info);
                        info  = breader.readLine();
   }
}
                catch(IOException e){
   System.err.println(e.toString());
                   System.exit(1);
}
}
}

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

1.Getting the Text in an HTML Document    // This method takes a URI which can be either a filename (e.g. file://c:/dir/file.html)
    // or a URL (e.g. http://host.com/page.html) and returns all text in the document.
    public static String getText(String uriStr) {
        final StringBuffer buf = new StringBuffer(1000);

        try {
            // Create an HTML document that appends all text to buf
            HTMLDocument doc = new HTMLDocument() {
                public HTMLEditorKit.ParserCallback getReader(int pos) {
                    return new HTMLEditorKit.ParserCallback() {
                        // This method is whenever text is encountered in the HTML file
                        public void handleText(char[] data, int pos) {
                            buf.append(data);
                            buf.append('\n');
                        }
                    };
                }
            };

            // Create a reader on the HTML content
            URL url = new URI(uriStr).toURL();
            URLConnection conn = url.openConnection();
            Reader rd = new InputStreamReader(conn.getInputStream());

            // Parse the HTML
            EditorKit kit = new HTMLEditorKit();
            kit.read(rd, doc, 0);
        } catch (MalformedURLException e) {
        } catch (URISyntaxException e) {
        } catch (BadLocationException e) {
        } catch (IOException e) {
        }

        // Return the text
        return buf.toString();
    }
2.Getting the Links in an HTML Document
    // This method takes a URI which can be either a filename (e.g. file://c:/dir/file.html)
    // or a URL (e.g. http://host.com/page.html) and returns all HREF links in the document.
    public static String[] getLinks(String uriStr) {
        List result = new ArrayList();

        try {
            // Create a reader on the HTML content
            URL url = new URI(uriStr).toURL();
            URLConnection conn = url.openConnection();
            Reader rd = new InputStreamReader(conn.getInputStream());

            // Parse the HTML
            EditorKit kit = new HTMLEditorKit();
            HTMLDocument doc = (HTMLDocument)kit.createDefaultDocument();
            kit.read(rd, doc, 0);

            // Find all the A elements in the HTML document
            HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
            while (it.isValid()) {
                SimpleAttributeSet s = (SimpleAttributeSet)it.getAttributes();

                String link = (String)s.getAttribute(HTML.Attribute.HREF);
                if (link != null) {
                    // Add the link to the result list
                    result.add(link);
                }
                it.next();
            }
        } catch (MalformedURLException e) {
        } catch (URISyntaxException e) {
        } catch (BadLocationException e) {
        } catch (IOException e) {
        }

        // Return all found links
        return (String[])result.toArray(new String[result.size()]);
    } Related Examples