去站点:http://sourceforge.net/ 下载HtmlParser开源包,放到CLASSPATH.下面的代码可以提取html文件中的超链接:
import org.htmlparser.HTMLNode;
import org.htmlparser.HTMLParser;
import org.htmlparser.tags.HTMLLinkTag;
import org.htmlparser.util.HTMLEnumeration;
import org.htmlparser.util.HTMLParserException;

/**
 * LinkExtractor extracts all the links from the given webpage
 * and prints them on standard output.
 */
public class LinkExtractor {
    /** Location (URL or file) handed to the parser. */
    private final String location;
    /** Parser created for {@code location}; {@code null} when creation failed. */
    private final HTMLParser parser;
    /** Failure raised while creating the parser; {@code null} on success. */
    private final HTMLParserException initFailure;

    /**
     * Creates the parser for the given location and registers the standard scanners.
     * A failure is not printed here; it is recorded and rethrown by
     * {@link #extractLinks()}, so the caller sees the real cause instead of a
     * later NullPointerException.
     *
     * @param location URL or file to parse
     */
    public LinkExtractor(String location) {
        this.location = location;
        HTMLParser created = null;
        HTMLParserException failure = null;
        try {
            created = new HTMLParser(location); // Create the parser object
            created.registerScanners(); // Register standard scanners (Very Important)
        } catch (HTMLParserException e) {
            created = null;
            failure = e;
        }
        this.parser = created;
        this.initFailure = failure;
    }

    /**
     * Walks every node produced by the parser and prints each link tag.
     *
     * @throws HTMLParserException if the parser could not be created in the
     *         constructor, or if parsing fails
     */
    public void extractLinks() throws HTMLParserException {
        if (initFailure != null) {
            throw initFailure;
        }
        System.out.println("Parsing " + location + " for links...");
        for (HTMLEnumeration e = parser.elements(); e.hasMoreNodes();) {
            HTMLNode node = e.nextHTMLNode(); // Get the next HTML Node
            if (node instanceof HTMLLinkTag) {
                HTMLLinkTag linkTag = (HTMLLinkTag) node; // Downcast to a Link Tag
                linkTag.print(); // Print it
            }
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the location (URL or file) to parse
     */
    public static void main(String[] args) {
        // An array length is never negative, so the check must be "< 1" to catch a missing argument.
        if (args.length < 1) {
            System.err.println("Syntax Error : Please provide the location(URL or file) to parse");
            System.exit(-1);
        }
        LinkExtractor linkExtractor = new LinkExtractor(args[0]);
        try {
            linkExtractor.extractLinks();
        } catch (HTMLParserException e) {
            e.printStackTrace();
        }
    }
}
import org.htmlparser.HTMLNode;
import org.htmlparser.HTMLParser;
import org.htmlparser.tags.HTMLLinkTag;
import org.htmlparser.util.HTMLEnumeration;
import org.htmlparser.util.HTMLParserException;/**
* LinkExtractor extracts all the links from the given webpage
* and prints them on standard output.
*/
public class LinkExtractor {
    /** Location (URL or file) handed to the parser. */
    private final String location;
    /** Parser created for {@code location}; {@code null} when creation failed. */
    private final HTMLParser parser;
    /** Failure raised while creating the parser; {@code null} on success. */
    private final HTMLParserException initFailure;

    /**
     * Creates the parser for the given location and registers the standard scanners.
     * A failure is not printed here; it is recorded and rethrown by
     * {@link #extractLinks()}, so the caller sees the real cause instead of a
     * later NullPointerException.
     *
     * @param location URL or file to parse
     */
    public LinkExtractor(String location) {
        this.location = location;
        HTMLParser created = null;
        HTMLParserException failure = null;
        try {
            created = new HTMLParser(location); // Create the parser object
            created.registerScanners(); // Register standard scanners (Very Important)
        } catch (HTMLParserException e) {
            created = null;
            failure = e;
        }
        this.parser = created;
        this.initFailure = failure;
    }

    /**
     * Walks every node produced by the parser and prints each link tag.
     *
     * @throws HTMLParserException if the parser could not be created in the
     *         constructor, or if parsing fails
     */
    public void extractLinks() throws HTMLParserException {
        if (initFailure != null) {
            throw initFailure;
        }
        System.out.println("Parsing " + location + " for links...");
        for (HTMLEnumeration e = parser.elements(); e.hasMoreNodes();) {
            HTMLNode node = e.nextHTMLNode(); // Get the next HTML Node
            if (node instanceof HTMLLinkTag) {
                HTMLLinkTag linkTag = (HTMLLinkTag) node; // Downcast to a Link Tag
                linkTag.print(); // Print it
            }
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the location (URL or file) to parse
     */
    public static void main(String[] args) {
        // An array length is never negative, so the check must be "< 1" to catch a missing argument.
        if (args.length < 1) {
            System.err.println("Syntax Error : Please provide the location(URL or file) to parse");
            System.exit(-1);
        }
        LinkExtractor linkExtractor = new LinkExtractor(args[0]);
        try {
            linkExtractor.extractLinks();
        } catch (HTMLParserException e) {
            e.printStackTrace();
        }
    }
}
谢谢你,我试试
正好看到有处理这个的API,感觉用不着第三方的包,可惜那段代码理解不了
比如这段文字
<a href="">slfk</a>先提取<号,如果<后面跟着a,再如果后面是href这四个字符,就把href里面的内容提取出来。
String line = "·<a href=\"http://bbs4.tom.com/r.php?forumid=443&postid=313\" target=_blank>[热贴]在日杀人我留学生判刑</a><br>";
// Attribute marker to search for; it is matched case-insensitively below.
String s = "href=";
System.out.println(line);
// Scan with regionMatches rather than StringTokenizer: the tokenizer treats
// "href=" as the set of delimiter characters {h, r, e, f, =}, not as a substring.
// The line is also no longer lower-cased, because URLs can be case-sensitive.
for (int i = 0; i + s.length() <= line.length(); i++) {
    if (!line.regionMatches(true, i, s, 0, s.length())) {
        continue;
    }
    int start = i + s.length();
    int end;
    if (start < line.length() && (line.charAt(start) == '"' || line.charAt(start) == '\'')) {
        // Quoted value: runs up to the matching closing quote.
        char quote = line.charAt(start);
        start++;
        end = line.indexOf(quote, start);
        if (end == -1) {
            end = line.length(); // unterminated quote: take the rest of the line
        }
    } else {
        // Unquoted value: runs up to the next whitespace or the end of the tag.
        end = start;
        while (end < line.length()
                && !Character.isWhitespace(line.charAt(end))
                && line.charAt(end) != '>') {
            end++;
        }
    }
    System.out.println(line.substring(start, end));
    i = end; // continue scanning after the extracted value
}
}不知道怎么做啊
------------------------------------------------------
// When href is still null and the tag is a FRAME, read the SRC attribute instead.
if ((href == null) && (t == HTML.Tag.FRAME))
href = (String) a.getAttribute(HTML.Attribute.SRC);if (href == null)
// href is still null after the fallback: stop processing here.
return;
------------------------------------------------------
应该是判断HTML.Tag是否是URL。如果是:
截取开始到“#”的字符串,然后调用方法report.spiderFoundEMail(href);返回。不知道说的对不对。
* Find the word with regex
* @param str
* @param regexHead
* @param regexBody
* @param regexFoot
* @return
*/
public static List getStringByRegex(String str,String regexHead,String regexBody,String regexFoot){
List list = new ArrayList();
String strComplete = regexHead + regexBody + regexFoot;
Pattern pattern = Pattern.compile(strComplete);
Matcher matcher = pattern.matcher(str);
while(matcher.find()){
list.add(matcher.group());
}
List formatList = new ArrayList();
for(int i=0,j=list.size();i<j;i++){
String strTmp = (String)list.get(i);
String strFind = strTmp.replaceAll(regexHead,"").replaceAll(regexFoot,"");
if(strFind!=null){
formatList.add(strFind);
}
}
return formatList;
}
调用:
// "[^>]*" matches the whole run of characters before '>'; "[^>]" would match exactly one character.
List list = getStringByRegex(str,"a=","[^>]*",">");
具体没有测试过,不过,我经常这么用的