private static String getStaticPage(String surl) { String htmlContent = ""; try { java.io.InputStream inputStream; java.net.URL url = new java.net.URL(surl); java.net.HttpURLConnection connection = (java.net.HttpURLConnection) url .openConnection(); connection.connect(); inputStream = connection.getInputStream(); byte[] bytes = new byte[1024 * 2000]; int index = 0; int count = inputStream.read(bytes, index, 1024 * 2000); while (count != -1) { index += count; count = inputStream.read(bytes, index, 1); } htmlContent = new String(bytes, "UTF-8"); connection.disconnect(); } catch (Exception ex) { ex.printStackTrace(); } return htmlContent.trim(); } public static void main(String[] args) { try { String src = getStaticPage("http://www.google.com"); File file = new File("d:\\aa.html"); FileWriter resultFile = new FileWriter(file); PrintWriter myFile = new PrintWriter(resultFile);// 写文件 myFile.println(src); resultFile.close(); myFile.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }自己修改下。
使用jsoup. 一个分析dzone网站联接的例子
/**
 *
 */
package com.linkwithweb.parser;

import java.io.File;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/****************************************************************
 * Description
 * jsoup elements support a CSS (or jquery) like selector syntax to find matching elements, that allows very powerful and robust queries.
 *
 * The select method is available in a Document, Element, or in Elements. It is contextual, so you can filter by selecting from a specific element, or
 * by chaining select calls.
 *
 * Select returns a list of Elements (as Elements), which provides a range of methods to extract and manipulate the results.
 *
 * Selector overview
 * tagname: find elements by tag, e.g. a
 * ns|tag: find elements by tag in a namespace, e.g. fb|name finds {@code <fb:name>} elements
 * #id: find elements by ID, e.g. #logo
 * .class: find elements by class name, e.g. .masthead
 * [attribute]: elements with attribute, e.g. [href]
 * [^attr]: elements with an attribute name prefix, e.g. [^data-] finds elements with HTML5 dataset attributes
 * [attr=value]: elements with attribute value, e.g. [width=500]
 * [attr^=value], [attr$=value], [attr*=value]: elements with attributes that start with, end with, or contain the value, e.g. [href*=/path/]
 * [attr~=regex]: elements with attribute values that match the regular expression; e.g. img[src~=(?i)\.(png|jpe?g)]
 * *: all elements, e.g. *
 *
 * Selector combinations
 * el#id: elements with ID, e.g. div#logo
 * el.class: elements with class, e.g. div.masthead
 * el[attr]: elements with attribute, e.g. a[href]
 * Any combination, e.g. a[href].highlight
 * ancestor child: child elements that descend from ancestor, e.g. .body p finds p elements anywhere under a block with class "body"
 * parent > child: child elements that descend directly from parent, e.g. div.content > p finds p elements; and body > * finds the direct children of
 * the body tag
 * siblingA + siblingB: finds sibling B element immediately preceded by sibling A, e.g. div.head + div
 * siblingA ~ siblingX: finds sibling X element preceded by sibling A, e.g. h1 ~ p
 * el, el, el: group multiple selectors, find unique elements that match any of the selectors; e.g. div.masthead, div.logo
 *
 * Pseudo selectors
 * :lt(n): find elements whose sibling index (i.e. its position in the DOM tree relative to its parent) is less than n; e.g. td:lt(3)
 * :gt(n): find elements whose sibling index is greater than n; e.g. div p:gt(2)
 * :eq(n): find elements whose sibling index is equal to n; e.g. form input:eq(1)
 * :has(selector): find elements that contain elements matching the selector; e.g. div:has(p)
 * :not(selector): find elements that do not match the selector; e.g. div:not(.logo)
 * :contains(text): find elements that contain the given text. The search is case-insensitive; e.g. p:contains(jsoup)
 * :containsOwn(text): find elements that directly contain the given text
 * :matches(regex): find elements whose text matches the specified regular expression; e.g. div:matches((?i)login)
 * :matchesOwn(regex): find elements whose own text matches the specified regular expression
 * Note that the above indexed pseudo-selectors are 0-based, that is, the first element is at index 0, the second at 1, etc
 * See the Selector API reference for the full supported list and details.
 *
 * @author Ashwin Kumar
 *
 */
public class HTMLParser {

    /**
     * Parses the file input/dZoneLinks.xml with jsoup and prints the own text
     * of every element matching "div.details > p.description", each followed
     * by a separator line. Any exception is reported via its stack trace.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            File input = new File("input/dZoneLinks.xml");
            Document doc = Jsoup.parse(input, "UTF-8",
                    "http://www.dzone.com/links/?type=html&p=2");

            // Get all description elements in this HTML file.
            Elements descriptions = doc.select("div.details > p.description");

            /*
             * Further selector examples left disabled by the original author:
             *
             * Elements pngs = doc.select("img[src$=.png]");
             * // img with src ending .png
             *
             * Element masthead = doc.select("div.masthead").first();
             * // div with class masthead
             *
             * Elements resultLinks = doc.select("h3.r > a");
             * // direct a after h3
             */

            // Iterate over all descriptions and display them.
            for (Element element : descriptions) {
                System.out.println(element.ownText());
                System.out.println("--------------");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
/**
 * Downloads the resource at the given URL and returns its body decoded as UTF-8.
 *
 * <p>Any failure (malformed URL, connection error, read error) is reported by
 * printing the stack trace, and an empty string is returned.
 *
 * @param surl the URL to fetch; it must resolve to an HTTP connection
 * @return the trimmed response body, or an empty string if it could not be read
 */
private static String getStaticPage(String surl) {
    String htmlContent = "";
    java.net.HttpURLConnection connection = null;
    java.io.InputStream inputStream = null;
    try {
        java.net.URL url = new java.net.URL(surl);
        connection = (java.net.HttpURLConnection) url.openConnection();
        connection.connect();
        inputStream = connection.getInputStream();

        // Grow the buffer on demand instead of relying on a fixed-size array,
        // so large responses no longer overflow it.
        java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int count;
        while ((count = inputStream.read(chunk)) != -1) {
            body.write(chunk, 0, count);
        }
        // Decode only the bytes that were actually read.
        htmlContent = body.toString("UTF-8");
    } catch (Exception ex) {
        ex.printStackTrace();
    } finally {
        // Release the stream and the connection on both success and failure.
        if (inputStream != null) {
            try {
                inputStream.close();
            } catch (java.io.IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
        if (connection != null) {
            connection.disconnect();
        }
    }
    return htmlContent.trim();
}

/**
 * Fetches http://www.google.com and writes the page to d:\aa.html.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    PrintWriter myFile = null;
    try {
        String src = getStaticPage("http://www.google.com");
        File file = new File("d:\\aa.html");
        myFile = new PrintWriter(new FileWriter(file)); // write the file
        myFile.println(src);
        // PrintWriter swallows write errors; surface them explicitly.
        if (myFile.checkError()) {
            throw new IOException("Failed to write " + file);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Closing the PrintWriter also closes the wrapped FileWriter.
        if (myFile != null) {
            myFile.close();
        }
    }
}
自己修改下。
一个分析dzone网站链接的例子
/**
*
*/
package com.linkwithweb.parser;import java.io.File;import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;/****************************************************************
* Description
* jsoup elements support a CSS (or jquery) like selector syntax to find matching elements, that allows very powerful and robust queries.
*
* The select method is available in a Document, Element, or in Elements. It is contextual, so you can filter by selecting from a specific element, or
* by chaining select calls.
*
* Select returns a list of Elements (as Elements), which provides a range of methods to extract and manipulate the results.
*
* Selector overview
* tagname: find elements by tag, e.g. a
* ns|tag: find elements by tag in a namespace, e.g. fb|name finds <fb:name> elements
* #id: find elements by ID, e.g. #logo
* .class: find elements by class name, e.g. .masthead
* [attribute]: elements with attribute, e.g. [href]
* [^attr]: elements with an attribute name prefix, e.g. [^data-] finds elements with HTML5 dataset attributes
* [attr=value]: elements with attribute value, e.g. [width=500]
* [attr^=value], [attr$=value], [attr*=value]: elements with attributes that start with, end with, or contain the value, e.g. [href*=/path/]
* [attr~=regex]: elements with attribute values that match the regular expression; e.g. img[src~=(?i)\.(png|jpe?g)]
* *: all elements, e.g. *
* Selector combinations
* el#id: elements with ID, e.g. div#logo
* el.class: elements with class, e.g. div.masthead
* el[attr]: elements with attribute, e.g. a[href]
* Any combination, e.g. a[href].highlight
* ancestor child: child elements that descend from ancestor, e.g. .body p finds p elements anywhere under a block with class "body"
* parent > child: child elements that descend directly from parent, e.g. div.content > p finds p elements; and body > * finds the direct children of
* the body tag
* siblingA + siblingB: finds sibling B element immediately preceded by sibling A, e.g. div.head + div
* siblingA ~ siblingX: finds sibling X element preceded by sibling A, e.g. h1 ~ p
* el, el, el: group multiple selectors, find unique elements that match any of the selectors; e.g. div.masthead, div.logo
* Pseudo selectors
* :lt(n): find elements whose sibling index (i.e. its position in the DOM tree relative to its parent) is less than n; e.g. td:lt(3)
* :gt(n): find elements whose sibling index is greater than n; e.g. div p:gt(2)
* :eq(n): find elements whose sibling index is equal to n; e.g. form input:eq(1)
* :has(selector): find elements that contain elements matching the selector; e.g. div:has(p)
* :not(selector): find elements that do not match the selector; e.g. div:not(.logo)
* :contains(text): find elements that contain the given text. The search is case-insensitive; e.g. p:contains(jsoup)
* :containsOwn(text): find elements that directly contain the given text
* :matches(regex): find elements whose text matches the specified regular expression; e.g. div:matches((?i)login)
* :matchesOwn(regex): find elements whose own text matches the specified regular expression
* Note that the above indexed pseudo-selectors are 0-based, that is, the first element is at index 0, the second at 1, etc
* See the Selector API reference for the full supported list and details.
*
* @author Ashwin Kumar
*
*/
public class HTMLParser {

    /**
     * Parses the file input/dZoneLinks.xml with jsoup and prints the own text
     * of every element matching "div.details > p.description", each followed
     * by a separator line. Any exception is reported via its stack trace.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            File savedPage = new File("input/dZoneLinks.xml");
            String baseUri = "http://www.dzone.com/links/?type=html&p=2";
            Document page = Jsoup.parse(savedPage, "UTF-8", baseUri);

            // Description paragraphs that are direct children of a "details" div.
            String descriptionQuery = "div.details > p.description";
            Elements summaries = page.select(descriptionQuery);

            // Other selector examples the original author left disabled:
            //   doc.select("img[src$=.png]")        - img with src ending .png
            //   doc.select("div.masthead").first()  - first div with class masthead
            //   doc.select("h3.r > a")              - direct a after h3

            // Print each description followed by a separator line.
            for (int i = 0; i < summaries.size(); i++) {
                Element summary = summaries.get(i);
                System.out.println(summary.ownText());
                System.out.println("--------------");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
来爬网页 很简单的
/**
 * Fetches a page over HTTP and returns its content as text.
 *
 * <p>If {@code strPostRequest} is non-empty it is written to the connection as
 * the request body before the response is read. As a side effect, the fetched
 * content with HTML entities and tags stripped is printed to standard output;
 * the returned value is the unstripped content.
 *
 * <p>NOTE(review): the request body and the response are encoded/decoded with
 * the platform default charset because no charset is passed to the
 * reader/writer - confirm this matches the target pages.
 *
 * @param strUrl         the URL of the page to fetch
 * @param strPostRequest extra data to send as the request body; {@code null}
 *                       or empty means no body is sent
 * @param maxLength      maximum number of characters to read; a value of 0 or
 *                       less means no limit
 * @return the trimmed page content, or {@code null} if the page could not be
 *         fetched
 */
public String getPageContent(String strUrl, String strPostRequest,
        int maxLength) {
    // Accumulates the characters read from the response.
    StringBuilder buffer = new StringBuilder();
    System.setProperty("sun.net.client.defaultConnectTimeout", "5000");
    System.setProperty("sun.net.client.defaultReadTimeout", "5000");
    HttpURLConnection hConnect = null;
    BufferedReader rd = null;
    try {
        URL newUrl = new URL(strUrl);
        hConnect = (HttpURLConnection) newUrl.openConnection();
        // The system properties above may already have been read by the JDK,
        // so also set the same timeouts on this connection explicitly.
        hConnect.setConnectTimeout(5000);
        hConnect.setReadTimeout(5000);

        // Extra data for a POST request.
        if (strPostRequest != null && strPostRequest.length() > 0) {
            hConnect.setDoOutput(true);
            OutputStreamWriter out = new OutputStreamWriter(hConnect
                    .getOutputStream());
            try {
                out.write(strPostRequest);
                out.flush();
            } finally {
                out.close();
            }
        }

        // Read the content, stopping at maxLength characters when a limit is set.
        rd = new BufferedReader(new InputStreamReader(
                hConnect.getInputStream()));
        int ch;
        while ((maxLength <= 0 || buffer.length() < maxLength)
                && (ch = rd.read()) > -1) {
            buffer.append((char) ch);
        }

        String content = buffer.toString();
        // Strings are immutable: keep the result of replaceAll so the printed
        // text really has entities and tags removed.
        String plainText = content.replaceAll("\\&[a-zA-Z]{1,10};", "")
                .replaceAll("<[^>]*>", "");
        System.out.println(plainText);
        return content.trim();
    } catch (Exception e) {
        // Failure is signalled to the caller by returning null (the original
        // author considered returning an error message string instead).
        return null;
    } finally {
        // Release the reader and the connection on both success and failure.
        if (rd != null) {
            try {
                rd.close();
            } catch (java.io.IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
        if (hConnect != null) {
            hConnect.disconnect();
        }
    }
}