如何抓取网页

/**
 * Downloads the resource at the given URL and returns its body decoded as UTF-8.
 *
 * <p>The body is accumulated in a growable buffer, so pages of any size are
 * supported. Any failure (malformed URL, non-HTTP URL, network error) is
 * reported on standard error and results in an empty string.
 *
 * @param surl the address of the page to download
 * @return the page content with leading and trailing whitespace removed,
 *         or an empty string if the page could not be read
 */
private static String getStaticPage(String surl) {
    String htmlContent = "";
    java.net.HttpURLConnection connection = null;
    java.io.InputStream inputStream = null;
    try {
        java.net.URL url = new java.net.URL(surl);
        connection = (java.net.HttpURLConnection) url.openConnection();
        connection.connect();
        inputStream = connection.getInputStream();

        // Copy the stream in chunks; only the bytes actually read are kept,
        // so no padding ends up in the decoded string.
        java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int count;
        while ((count = inputStream.read(chunk)) != -1) {
            collected.write(chunk, 0, count);
        }
        htmlContent = collected.toString("UTF-8");
    } catch (Exception ex) {
        ex.printStackTrace();
    } finally {
        // Release the stream and the connection on both success and failure.
        if (inputStream != null) {
            try {
                inputStream.close();
            } catch (java.io.IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
        if (connection != null) {
            connection.disconnect();
        }
    }
    return htmlContent.trim();
}

/**
 * Downloads the Google home page and saves it to {@code d:\aa.html}.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    PrintWriter myFile = null;
    try {
        String src = getStaticPage("http://www.google.com");
        File file = new File("d:\\aa.html");
        // Write with the same charset the page was decoded with, instead of
        // the platform default.
        myFile = new PrintWriter(file, "UTF-8");
        myFile.println(src);
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Closing the PrintWriter flushes and closes the underlying file.
        if (myFile != null) {
            myFile.close();
        }
    }
}
自己修改下。

使用jsoup.
一个分析dzone网站链接的例子
/**
*
*/
package com.linkwithweb.parser;import java.io.File;import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;/****************************************************************
* Description
* jsoup elements support a CSS (or jquery) like selector syntax to find matching elements, that allows very powerful and robust queries.
*
* The select method is available in a Document, Element, or in Elements. It is contextual, so you can filter by selecting from a specific element, or
* by chaining select calls.
*
* Select returns a list of Elements (as Elements), which provides a range of methods to extract and manipulate the results.
*
* Selector overview
* tagname: find elements by tag, e.g. a
* ns|tag: find elements by tag in a namespace, e.g. fb|name finds <fb:name> elements
* #id: find elements by ID, e.g. #logo
* .class: find elements by class name, e.g. .masthead
* [attribute]: elements with attribute, e.g. [href]
* [^attr]: elements with an attribute name prefix, e.g. [^data-] finds elements with HTML5 dataset attributes
* [attr=value]: elements with attribute value, e.g. [width=500]
* [attr^=value], [attr$=value], [attr*=value]: elements with attributes that start with, end with, or contain the value, e.g. [href*=/path/]
* [attr~=regex]: elements with attribute values that match the regular expression; e.g. img[src~=(?i)\.(png|jpe?g)]
* *: all elements, e.g. *
* Selector combinations
* el#id: elements with ID, e.g. div#logo
* el.class: elements with class, e.g. div.masthead
* el[attr]: elements with attribute, e.g. a[href]
* Any combination, e.g. a[href].highlight
* ancestor child: child elements that descend from ancestor, e.g. .body p finds p elements anywhere under a block with class "body"
* parent > child: child elements that descend directly from parent, e.g. div.content > p finds p elements; and body > * finds the direct children of
* the body tag
* siblingA + siblingB: finds sibling B element immediately preceded by sibling A, e.g. div.head + div
* siblingA ~ siblingX: finds sibling X element preceded by sibling A, e.g. h1 ~ p
* el, el, el: group multiple selectors, find unique elements that match any of the selectors; e.g. div.masthead, div.logo
* Pseudo selectors
* :lt(n): find elements whose sibling index (i.e. its position in the DOM tree relative to its parent) is less than n; e.g. td:lt(3)
* :gt(n): find elements whose sibling index is greater than n; e.g. div p:gt(2)
* :eq(n): find elements whose sibling index is equal to n; e.g. form input:eq(1)
* :has(selector): find elements that contain elements matching the selector; e.g. div:has(p)
* :not(selector): find elements that do not match the selector; e.g. div:not(.logo)
* :contains(text): find elements that contain the given text. The search is case-insensitive; e.g. p:contains(jsoup)
* :containsOwn(text): find elements that directly contain the given text
* :matches(regex): find elements whose text matches the specified regular expression; e.g. div:matches((?i)login)
* :matchesOwn(regex): find elements whose own text matches the specified regular expression
* Note that the above indexed pseudo-selectors are 0-based, that is, the first element is at index 0, the second at 1, etc
* See the Selector API reference for the full supported list and details.
*
* @author Ashwin Kumar
*
*/
public class HTMLParser {

    /**
     * Parses a locally saved page and prints the own text of every element
     * matching {@code div.details > p.description}, each followed by a
     * separator line.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        try {
            File savedPage = new File("input/dZoneLinks.xml");
            Document page = Jsoup.parse(savedPage, "UTF-8",
                    "http://www.dzone.com/links/?type=html&p=2");

            // Other selector examples:
            //   page.select("img[src$=.png]")        - img elements whose src ends with .png
            //   page.select("div.masthead").first()  - first div with class "masthead"
            //   page.select("h3.r > a")              - a elements directly under h3.r

            // All description paragraphs in the parsed document.
            Elements descriptions = page.select("div.details > p.description");

            // Print each description followed by a separator.
            for (int i = 0; i < descriptions.size(); i++) {
                Element description = descriptions.get(i);
                System.out.println(description.ownText());
                System.out.println("--------------");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

用火狐就可以看到源码呀，在源码中可以点击链接打开关联的js、css、html、asp代码。如果要抓取整个网站的话，就要下载专业的抓取工具了。去google一下。

你可以用正则表达式和java.net.URL中的方法
来爬网页很简单的
    /**
     * Fetches a page over HTTP and returns its content.
     *
     * <p>When {@code strPostRequest} is non-empty it is sent as the request
     * body (which makes the request a POST); otherwise a plain GET is issued.
     * As a side effect, a copy of the content with HTML entities and tags
     * removed is printed to standard output.
     *
     * @param strUrl         the address of the page to fetch
     * @param strPostRequest data to send as the request body; {@code null} or
     *                       empty means no body is sent
     * @param maxLength      maximum number of characters to read; zero or a
     *                       negative value means no limit
     * @return the (unstripped) page content with leading and trailing
     *         whitespace removed, or {@code null} if the page could not be read
     */
    public String getPageContent(String strUrl, String strPostRequest,
            int maxLength) {
        // Holds the page as it is read.
        StringBuilder buffer = new StringBuilder();
        System.setProperty("sun.net.client.defaultConnectTimeout", "5000");
        System.setProperty("sun.net.client.defaultReadTimeout", "5000");
        HttpURLConnection hConnect = null;
        BufferedReader rd = null;
        try {
            URL newUrl = new URL(strUrl);
            hConnect = (HttpURLConnection) newUrl.openConnection();
            // Also set the timeouts on this connection directly, so they do
            // not depend on when the JVM-wide properties above are read.
            hConnect.setConnectTimeout(5000);
            hConnect.setReadTimeout(5000);

            // Extra data for a POST request.
            if (strPostRequest != null && strPostRequest.length() > 0) {
                hConnect.setDoOutput(true);
                // NOTE(review): the body is encoded with the platform default
                // charset, as before - confirm this is what the server expects.
                OutputStreamWriter out = new OutputStreamWriter(hConnect
                        .getOutputStream());
                try {
                    out.write(strPostRequest);
                    out.flush();
                } finally {
                    out.close();
                }
            }

            // Read the response, up to maxLength characters when a limit is set.
            // NOTE(review): decoded with the platform default charset, as before.
            rd = new BufferedReader(new InputStreamReader(
                    hConnect.getInputStream()));
            int ch;
            for (int length = 0; (maxLength <= 0 || length < maxLength)
                    && (ch = rd.read()) > -1; length++) {
                buffer.append((char) ch);
            }

            // Strings are immutable: the result of replaceAll must be kept,
            // otherwise the printed text still contains entities and tags.
            String s = buffer.toString();
            s = s.replaceAll("\\&[a-zA-Z]{1,10};", "").replaceAll("<[^>]*>", "");
            System.out.println(s);

            return buffer.toString().trim();
        } catch (Exception e) {
            // Best effort: any failure is reported to the caller as null.
            return null;
        } finally {
            // Release the reader and the connection on both success and failure.
            if (rd != null) {
                try {
                    rd.close();
                } catch (java.io.IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
            if (hConnect != null) {
                hConnect.disconnect();
            }
        }
    }

调试易

如何抓取网页

解决方案 »