如何实现网站数据分类抓取？

用 fckeditor, jericho
就完事了

建议lz去问问java版主老紫竹。

比如：在我的表单中输入：http://www.sina.com.cn，选择－》体育，然后点抓取按钮，就能抓取到 sina 的所有体育新闻...

http://topic.csdn.net/u/20080511/11/4fdaa9ea-a892-4a05-ae6c-5fb617ed3a46.html

请问：
/**
 * Downloads one hard-coded article page and prints its text to standard output.
 *
 * <p>The page is read line by line; every line is trimmed, decoded as GB2312 and
 * terminated with "\r\n" before the assembled document is printed. Any failure is
 * reported with {@code printStackTrace()} and the method then returns normally.
 */
public void getContent(){
    // Declared outside the try so the finally block can always release the stream.
    DataInputStream dis2=null;
    try{
        String url="http://tech.163.com/digi/08/0509/12/4BGKIOFQ001618J1.html";
        StringBuffer strBuffer=new StringBuffer();
        URL rss2=new URL(url);
        URLConnection con2=rss2.openConnection();
        dis2=new DataInputStream(con2.getInputStream());
        String line2;
        while((line2=dis2.readLine())!=null){
            // DataInputStream.readLine() does not decode bytes properly, so the line is
            // turned back into its raw bytes via ISO-8859-1 and re-decoded as GB2312.
            String filter=new String(line2.trim().getBytes("iso8859-1"),"GB2312");
            strBuffer.append(filter).append("\r\n");
        }
        System.out.println(strBuffer.toString());
    }
    catch(Exception e){
        e.printStackTrace();
    }
    finally{
        try{
            if(dis2!=null)dis2.close();
        }
        catch(Exception ignored){
            // Closing is best effort; there is nothing useful to do if it fails.
        }
    }
}
怎么用？谢谢！！1

修改url地址,指向你所要获取的网页
楼主，我已经实现抓取网易的最新的体育新闻
QQ 617736906

/**
 * Reads one of the 163.com RSS feeds, downloads the article behind each selected item
 * and returns the articles as {@code RssVO} objects.
 *
 * <p>Each returned object copies its display settings (colours, font size, speed, ...)
 * from {@code vo}, uses the cleaned item title as its name and the filtered article
 * paragraphs as its value. Articles that yield no text are left out.
 *
 * <p>Failures (network, XML parsing, ...) are printed with {@code printStackTrace()};
 * whatever was collected before the failure is still returned.
 *
 * @param vo       template whose settings are copied into every result
 * @param newsType feed selector: 0 domestic, 1 international, 2 society, 3 sports,
 *                 4 headlines, 5 entertainment; any other value yields an empty list
 * @return list of {@code RssVO}, possibly empty, never {@code null}
 */
public List getNewsContent(RssVO vo,int newsType){
    List<RssVO> lt_content=new ArrayList<RssVO>();
    DataInputStream dis=null;
    try{
        System.out.println("[新闻文本]");
        String url;
        switch(newsType){
            case 0: url="http://news.163.com/special/00011K6L/rss_gn.xml"; break; // domestic
            case 1: url="http://news.163.com/special/00011K6L/rss_gj.xml"; break; // international
            case 2: url="http://news.163.com/special/00011K6L/rss_sh.xml"; break; // society
            case 3: url="http://sports.163.com/special/00051K7F/rss_sportszh.xml"; break; // sports
            case 4: url="http://news.163.com/special/00011K6L/rss_newstop.xml"; break; // headlines
            case 5: url="http://ent.163.com/special/00031K7Q/rss_entstar.xml"; break; // entertainment
            default: url=""; break;
        }
        if(url.length()==0){
            // Unknown category: there is no feed to read.
            return lt_content;
        }

        URLConnection con=new URL(url).openConnection();
        con.setConnectTimeout(30000); // connect timeout
        con.setReadTimeout(30000); // read timeout
        dis=new DataInputStream(con.getInputStream());
        StringBuffer buffer=new StringBuffer();
        String line;
        while((line=dis.readLine())!=null){
            // Recover the raw bytes of the line and decode them as GB2312.
            line=new String(line.trim().getBytes("iso8859-1"),"GB2312");
            buffer.append(line).append("\n");
        }
        String newsXml=buffer.toString();
        dis.close();
        dis=null;

        // Re-encode with the same charset the text was decoded with instead of the
        // platform default, so parsing does not depend on the machine's locale.
        // NOTE(review): assumes the feed's XML declaration is GB2312-compatible - confirm.
        InputStream is=new ByteArrayInputStream(newsXml.getBytes("GB2312"));
        DocumentBuilder builder=DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document doc=builder.parse(is);
        NodeList list=doc.getElementsByTagName("item");

        // NOTE(review): feeds with 10 or more items are cut to 5 while feeds with 6-9
        // items are processed in full - kept as is, confirm the intended limit.
        int num;
        if(list.getLength()>=10)
            num=5;
        else
            num=list.getLength();

        String arrayTitle[]=new String[num];
        String arrayLink[]=new String[num];
        for(int i=0;i<num;i++){
            Element node=(Element)list.item(i);
            String title=node.getElementsByTagName("title")
                    .item(0).getFirstChild().getNodeValue();
            String link=node.getElementsByTagName("link")
                    .item(0).getFirstChild().getNodeValue();
            arrayTitle[i]=stripPictureMarkers(title);
            arrayLink[i]=link;
        }

        // Second pass: download the article each item links to.
        RssFunction rp=new RssFunction();
        for(int j=0;j<arrayLink.length;j++){
            StringBuffer strBuffer=new StringBuffer();
            DataInputStream dis2=null;
            try{
                URLConnection con2=new URL(arrayLink[j]).openConnection();
                con2.setConnectTimeout(30000); // same limits as the feed request
                con2.setReadTimeout(30000);
                dis2=new DataInputStream(con2.getInputStream());
                String line2;
                while((line2=dis2.readLine())!=null){
                    line2=new String(line2.trim().getBytes("iso8859-1"),"GB2312");
                    // Only lines carrying this paragraph marker are kept as article text.
                    if(line2.indexOf("<P style=\"TEXT-INDENT: 2em\">")!=-1){
                        String filter=rp.filterHtml(line2); // strip html tags
                        if(filter.length()>3){
                            filter=filter.replaceAll("\""," ");
                            strBuffer.append("<img width=\"3\" src=\"\"/>"+filter+"<br/>");
                        }
                    }
                }
            }
            finally{
                try{
                    if(dis2!=null)dis2.close();
                }
                catch(Exception ignored){
                    // Closing is best effort.
                }
            }
            String content=strBuffer.toString();
            System.out.println("["+arrayTitle[j]+"] ");
            if(content.length()!=0){
                RssVO vo2=new RssVO();
                vo2.setBg_color(vo.getBg_color());
                vo2.setClient_id(vo.getClient_id());
                vo2.setCode(vo.getCode()); // copy from the template, not from vo2 itself
                vo2.setFg_color(vo.getFg_color());
                vo2.setFont_size(vo.getFont_size());
                vo2.setFresh(vo.getFresh());
                vo2.setGap(vo.getGap());
                vo2.setName(arrayTitle[j].replaceAll("\""," "));
                vo2.setSpeed(vo.getSpeed());
                vo2.setType(5);
                vo2.setValue(content);
                lt_content.add(vo2);
            }
        }
    }
    catch(Exception e){
        // Report the failure instead of hiding it; partial results are still returned.
        e.printStackTrace();
    }
    finally{
        try{
            if(dis!=null)dis.close();
        }
        catch(Exception ignored){
            // Closing is best effort.
        }
    }
    return lt_content;
}

/**
 * Cuts a title at the first "(图)" or "(组图)" marker, whichever comes earlier.
 * Titles without either marker are returned unchanged.
 */
private static String stripPictureMarkers(String title){
    int cut=title.length();
    int n=title.indexOf("(图)");
    if(n!=-1&&n<cut)
        cut=n;
    int n1=title.indexOf("(组图)");
    if(n1!=-1&&n1<cut)
        cut=n1;
    return title.substring(0,cut);
}

这个简单啊，不就是一个URL处理的问题吗？其他的抓取与你平常的抓取没区别吧
只是根据你输入的类别去选择相应的URL地址传递过去抓取就可以了啊

请教 danier_sky：RssVO vo, int newsType 怎么赋值？

RssVO 是我自己写的一个javaBean ，你可以去掉
newsType是一个int参数（0－－5），表示你想要获取哪种类型的新闻

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
//这个是RssFunction 调用的函数
/** Matches a complete script element, including its content (case-insensitive). */
private static final java.util.regex.Pattern SCRIPT_ELEMENT = java.util.regex.Pattern.compile(
        "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>",
        java.util.regex.Pattern.CASE_INSENSITIVE);

/** Matches a complete style element, including its content (case-insensitive). */
private static final java.util.regex.Pattern STYLE_ELEMENT = java.util.regex.Pattern.compile(
        "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>",
        java.util.regex.Pattern.CASE_INSENSITIVE);

/** Matches any remaining tag. */
private static final java.util.regex.Pattern ANY_TAG = java.util.regex.Pattern.compile(
        "<[^>]+>",
        java.util.regex.Pattern.CASE_INSENSITIVE);

/**
 * Reduces an HTML fragment to its text.
 *
 * <p>Script and style elements are removed together with their content, every other tag
 * is replaced by a space, and finally all space characters are deleted - including the
 * spaces that were part of the text itself.
 *
 * @param htmlStr HTML fragment; {@code null} is treated like an empty string
 * @return the text with tags and spaces removed, never {@code null}
 */
public String filterHtml(String htmlStr) {
    if (htmlStr == null) {
        return "";
    }
    String withoutScript = SCRIPT_ELEMENT.matcher(htmlStr).replaceAll(""); // drop script elements
    String withoutStyle = STYLE_ELEMENT.matcher(withoutScript).replaceAll(""); // drop style elements
    String withoutTags = ANY_TAG.matcher(withoutStyle).replaceAll(" "); // drop remaining tags
    return withoutTags.replace(" ", "");
}
你必须导入htmlparser.jar包，去网上下一个

执行之后就是：
[新闻文本]
[Fatal Error] :17:265: The element type "title" must be terminated by the matching end-tag "</title>".

不就是一个URL处理的问题吗？其他的抓取与你平常的抓取没区别吧
只是根据你输入的类别去选择相应的URL地址传递过去抓取就可以了啊

用csdn的聊天工具吧
那样方便些

调试易

如何实现网站数据分类抓取？

解决方案 »