package mor.htm;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;public class Urls {
private String startUrl; // 开始采集网址 String urlContent; String ContentArea; private String strAreaBegin, strAreaEnd; // 采集区域开始采集字符串和结束采集字符串 @SuppressWarnings("unused")
private String stringInUrl, stringNotInUrl; String strContent;// 获得的采集内容 String[] allUrls; // 采集到的所有网址 private String regex; // 采集规则 UrlAndTitle urlAndTitle = new UrlAndTitle(); // 存储网址和标题 public static void main(String[] args)
{
Urls myurl = new Urls("body", "/body");
myurl.getStartUrl("http://www.godo.cc/product/list/3.shtml");
myurl.getUrlContent();
myurl.getContentArea();
myurl.getStringInUrl("http://www.godo.cc/product/list/3.shtml");
myurl.getStringNotInUrl("google");
myurl.Urls(); } // 初始化构造函数 strAreaBegin 和strAreaEnd public Urls(String strAreaBegin, String strAreaEnd) {
this.strAreaBegin = strAreaBegin;
this.strAreaEnd = strAreaEnd;
} //
public void Urls()
{
int i = 0;
// String regex ="<a
// href="?'?http://[a-zA-Z0-9]+\.[a-zA-Z0-9]+\.[a-zA-Z]+/?[\.?[\S|\s]]+[a>]$";
final String regex = "<a.*?/a>";
// String regex ="http://.*?>";
final Pattern pt = Pattern.compile(regex);
final Matcher mt = pt.matcher(ContentArea);
while (mt.find()) {
i++; // 获取网址
final Matcher myurl = Pattern.compile("href=.*?>").matcher(mt.group());
while (myurl.find()) {
System.out.println("http://www.godo.cc"
+ myurl.group().replaceAll("href=|>", ""));
} } } // 获得开始采集网址
public void getStartUrl(String startUrl)
{
this.startUrl = startUrl;
} // 获得网址所在内容;
public void getUrlContent()
{ StringBuffer is = new StringBuffer();
try {
URL myUrl = new URL(startUrl);
BufferedReader br = new BufferedReader(new InputStreamReader(myUrl
.openStream())); String s;
while ((s = br.readLine()) != null) {
is.append(s);
}
urlContent = is.toString();
} catch (Exception e) {
System.out.println("网址文件未能输出");
e.printStackTrace();
} } // 获得网址所在的匹配区域部分
public void getContentArea()
{
int pos1 = 0, pos2 = 0;
pos1 = urlContent.indexOf(strAreaBegin) + strAreaBegin.length();
pos2 = urlContent.indexOf(strAreaEnd, pos1);
ContentArea = urlContent.substring(pos1, pos2);
}
public void getStringInUrl(String stringInUrl)
{
this.stringInUrl = stringInUrl; } public void getStringNotInUrl(String stringNotInUrl)
{
this.stringNotInUrl = stringNotInUrl;
} // 获取采集规则 // 获取url网址
public void getUrl()
{ } public String getRegex()
{
return regex; } class UrlAndTitle {
String myURL; String title;
}
}下面是结果
http://www.godo.cc"/" title="顾登商城"
http://www.godo.cc"/account/" class="fontred"
http://www.godo.cc"/account/register" class="fontred"
http://www.godo.cc"/orders/mycart"
http://www.godo.cc"/"
http://www.godo.cc"/brand"
http://www.godo.cc'/product/list/66.shtml'
http://www.godo.cc'/product/list/67.shtml'
http://www.godo.cc'/product/list/68.shtml'
http://www.godo.cc'/product/list/69.shtml'
http://www.godo.cc'/product/list/70.shtml'
http://www.godo.cc'/product/list/830.shtml'
http://www.godo.cc'/product/list/831.shtml'
http://www.godo.cc'/product/list/72.shtml'
http://www.godo.cc'/product/list/73.shtml'
http://www.godo.cc'/product/list/74.shtml'
http://www.godo.cc'/product/list/75.shtml'
http://www.godo.cc'/product/list/76.shtml'
http://www.godo.cc'/product/list/77.shtml'
http://www.godo.cc'/product/list/78.shtml'
http://www.godo.cc'/product/list/79.shtml'
http://www.godo.cc'/product/list/812.shtml'
http://www.godo.cc'/product/list/158.shtml'
http://www.godo.cc'/product/list/159.shtml'
http://www.godo.cc'/product/list/160.shtml'
http://www.godo.cc'/product/list/161.shtml'
http://www.godo.cc'/product/list/162.shtml'
http://www.godo.cc'/product/list/163.shtml'
http://www.godo.cc'/product/list/164.shtml'
http://www.godo.cc'/product/list/165.shtml'
http://www.godo.cc'/product/list/166.shtml'
http://www.godo.cc'/product/list/168.shtml'
http://www.godo.cc'/product/list/169.shtml'
http://www.godo.cc'/product/list/170.shtml'
http://www.godo.cc'/product/list/171.shtml'
http://www.godo.cc'/product/list/172.shtml'
http://www.godo.cc'/product/list/173.shtml'
http://www.godo.cc'/product/list/174.shtml'
http://www.godo.cc'/product/list/175.shtml'
http://www.godo.cc'/product/list/625.shtml'
http://www.godo.cc'/product/list/633.shtml'
http://www.godo.cc'/product/list/634.shtml'
http://www.godo.cc'/product/list/176.shtml'
http://www.godo.cc'/product/list/177.shtml'
http://www.godo.cc'/product/list/178.shtml'
http://www.godo.cc'/product/list/179.shtml'
http://www.godo.cc'/product/list/180.shtml'
http://www.godo.cc'/product/list/181.shtml'
http://www.godo.cc'/product/list/182.shtml'
http://www.godo.cc'/product/list/183.shtml'
http://www.godo.cc"/"
小弟就是想请教怎么去掉""符号和'符号啊 在线坐等大哥
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;public class Urls {
private String startUrl; // 开始采集网址 String urlContent; String ContentArea; private String strAreaBegin, strAreaEnd; // 采集区域开始采集字符串和结束采集字符串 @SuppressWarnings("unused")
private String stringInUrl, stringNotInUrl; String strContent;// 获得的采集内容 String[] allUrls; // 采集到的所有网址 private String regex; // 采集规则 UrlAndTitle urlAndTitle = new UrlAndTitle(); // 存储网址和标题 public static void main(String[] args)
{
Urls myurl = new Urls("body", "/body");
myurl.getStartUrl("http://www.godo.cc/product/list/3.shtml");
myurl.getUrlContent();
myurl.getContentArea();
myurl.getStringInUrl("http://www.godo.cc/product/list/3.shtml");
myurl.getStringNotInUrl("google");
myurl.Urls(); } // 初始化构造函数 strAreaBegin 和strAreaEnd public Urls(String strAreaBegin, String strAreaEnd) {
this.strAreaBegin = strAreaBegin;
this.strAreaEnd = strAreaEnd;
} //
public void Urls()
{
int i = 0;
// String regex ="<a
// href="?'?http://[a-zA-Z0-9]+\.[a-zA-Z0-9]+\.[a-zA-Z]+/?[\.?[\S|\s]]+[a>]$";
final String regex = "<a.*?/a>";
// String regex ="http://.*?>";
final Pattern pt = Pattern.compile(regex);
final Matcher mt = pt.matcher(ContentArea);
while (mt.find()) {
i++; // 获取网址
final Matcher myurl = Pattern.compile("href=.*?>").matcher(mt.group());
while (myurl.find()) {
System.out.println("http://www.godo.cc"
+ myurl.group().replaceAll("href=|>", ""));
} } } // 获得开始采集网址
public void getStartUrl(String startUrl)
{
this.startUrl = startUrl;
} // 获得网址所在内容;
public void getUrlContent()
{ StringBuffer is = new StringBuffer();
try {
URL myUrl = new URL(startUrl);
BufferedReader br = new BufferedReader(new InputStreamReader(myUrl
.openStream())); String s;
while ((s = br.readLine()) != null) {
is.append(s);
}
urlContent = is.toString();
} catch (Exception e) {
System.out.println("网址文件未能输出");
e.printStackTrace();
} } // 获得网址所在的匹配区域部分
public void getContentArea()
{
int pos1 = 0, pos2 = 0;
pos1 = urlContent.indexOf(strAreaBegin) + strAreaBegin.length();
pos2 = urlContent.indexOf(strAreaEnd, pos1);
ContentArea = urlContent.substring(pos1, pos2);
}
public void getStringInUrl(String stringInUrl)
{
this.stringInUrl = stringInUrl; } public void getStringNotInUrl(String stringNotInUrl)
{
this.stringNotInUrl = stringNotInUrl;
} // 获取采集规则 // 获取url网址
public void getUrl()
{ } public String getRegex()
{
return regex; } class UrlAndTitle {
String myURL; String title;
}
}下面是结果
http://www.godo.cc"/" title="顾登商城"
http://www.godo.cc"/account/" class="fontred"
http://www.godo.cc"/account/register" class="fontred"
http://www.godo.cc"/orders/mycart"
http://www.godo.cc"/"
http://www.godo.cc"/brand"
http://www.godo.cc'/product/list/66.shtml'
http://www.godo.cc'/product/list/67.shtml'
http://www.godo.cc'/product/list/68.shtml'
http://www.godo.cc'/product/list/69.shtml'
http://www.godo.cc'/product/list/70.shtml'
http://www.godo.cc'/product/list/830.shtml'
http://www.godo.cc'/product/list/831.shtml'
http://www.godo.cc'/product/list/72.shtml'
http://www.godo.cc'/product/list/73.shtml'
http://www.godo.cc'/product/list/74.shtml'
http://www.godo.cc'/product/list/75.shtml'
http://www.godo.cc'/product/list/76.shtml'
http://www.godo.cc'/product/list/77.shtml'
http://www.godo.cc'/product/list/78.shtml'
http://www.godo.cc'/product/list/79.shtml'
http://www.godo.cc'/product/list/812.shtml'
http://www.godo.cc'/product/list/158.shtml'
http://www.godo.cc'/product/list/159.shtml'
http://www.godo.cc'/product/list/160.shtml'
http://www.godo.cc'/product/list/161.shtml'
http://www.godo.cc'/product/list/162.shtml'
http://www.godo.cc'/product/list/163.shtml'
http://www.godo.cc'/product/list/164.shtml'
http://www.godo.cc'/product/list/165.shtml'
http://www.godo.cc'/product/list/166.shtml'
http://www.godo.cc'/product/list/168.shtml'
http://www.godo.cc'/product/list/169.shtml'
http://www.godo.cc'/product/list/170.shtml'
http://www.godo.cc'/product/list/171.shtml'
http://www.godo.cc'/product/list/172.shtml'
http://www.godo.cc'/product/list/173.shtml'
http://www.godo.cc'/product/list/174.shtml'
http://www.godo.cc'/product/list/175.shtml'
http://www.godo.cc'/product/list/625.shtml'
http://www.godo.cc'/product/list/633.shtml'
http://www.godo.cc'/product/list/634.shtml'
http://www.godo.cc'/product/list/176.shtml'
http://www.godo.cc'/product/list/177.shtml'
http://www.godo.cc'/product/list/178.shtml'
http://www.godo.cc'/product/list/179.shtml'
http://www.godo.cc'/product/list/180.shtml'
http://www.godo.cc'/product/list/181.shtml'
http://www.godo.cc'/product/list/182.shtml'
http://www.godo.cc'/product/list/183.shtml'
http://www.godo.cc"/"
小弟就是想请教怎么去掉""符号和'符号啊 在线坐等大哥
s = s.replaceAll("\'","");