关于截取数据

本人有一段代码:
import java.io.*;
import java.net.*;

/**
 * Scrapes the headline list from the Sina domestic-news index page and saves every linked
 * article as a small standalone HTML file ({@code 1.htm}, {@code 2.htm}, ...) using paths
 * relative to the current working directory.
 *
 * <p>The extraction is purely textual: it searches for exact markup fragments in the
 * downloaded pages. When an expected fragment is missing, an {@link IllegalStateException}
 * naming the fragment is thrown instead of continuing with a bogus offset.
 */
public class Test5{
  /**
   * Downloads the resource at the given URL and returns its body as text.
   *
   * @param href URL to fetch
   * @return the response body, decoded with the platform default charset
   * @throws Exception if the URL is malformed or connecting/reading fails
   */
  static String get_from_internet(String href) throws Exception{
    URL url = new URL(href);
    URLConnection url_con = url.openConnection();
    return get_content(url_con.getInputStream());
  }

  /**
   * Reads a whole file and returns its content as text.
   *
   * @param path path of the file to read
   * @return the file content, decoded with the platform default charset
   * @throws Exception if the file cannot be opened or read
   */
  static String get_from_file(String path) throws Exception{
    return get_content(new FileInputStream(path));
  }

  /**
   * Writes the given text to a file, replacing any existing content.
   *
   * @param path path of the file to write
   * @param content text to store, encoded with the platform default charset
   * @throws Exception if the file cannot be opened or written
   */
  static void write_to_file(String path,String content)throws Exception{
    FileOutputStream output = new FileOutputStream(path);
    try {
      output.write(content.getBytes());
    } finally {
      // Close even when the write fails so the file handle is not leaked.
      output.close();
    }
  }

  /**
   * Fetches the index page, walks over every headline link in the headline cell, downloads
   * each article and writes it to {@code <n>.htm}, numbering from 1.
   *
   * @param args unused
   * @throws Exception on any network or file error, or when expected markup is missing
   */
  public static void main(String[] args) throws Exception{
    String str = get_from_internet("http://news.sina.com.cn/china/");
    String start_tag = "\t<table width=320 cellspacing=0 style='margin:7px 0 7px 0'>\r\n\t<tr><td class='linkBlue f14 lh22'>";

    // Narrow the page down to the content of the table cell that holds the headline links.
    int start = index_after(str, start_tag, 0, "headline table start");
    int end = require_index(str, "</td>", start, "headline cell end");
    str = str.substring(start, end);

    String tag = "·<a href=";
    int file_count = 0;
    start = 0;
    while((start=str.indexOf(tag, start))!=-1){
      start += tag.length();
      // The link target is everything between "href=" and the target attribute.
      String href = str.substring(start, require_index(str, "target=_blank>", start, "link target attribute"));
      // Safe: the marker found above contains '>', so this search cannot fail.
      start = str.indexOf(">", start) + 1;
      String title = str.substring(start, require_index(str, "</a>", start, "link closing tag"));

      String content = build_page(title, get_from_internet(href));
      file_count++;
      write_to_file(file_count + ".htm", content);
    }
  }

  /** Cuts the article body out of a downloaded article page and wraps it in a minimal HTML page. */
  private static String build_page(String title, String article) {
    String t_t = "<font id=\"zoom\" class=f14>";
    int t_s = index_after(article, t_t, 0, "article body start");
    int body_end = require_index(article, "<br clear=all>\r\n\t</td></tr>", t_s, "article body end");
    String content = article.substring(t_s, body_end);

    // NOTE(review): the original code searched for "" to find the start of the part to remove,
    // which always yields index 0; the intended marker may have been lost when the code was
    // posted - confirm. The effective behavior is kept: everything up to and including the
    // last </table> is dropped.
    t_t = "</table>";
    int last_table = content.lastIndexOf(t_t);
    if (last_table != -1) {
      // Without this guard a missing </table> would chop the first 7 characters of the body.
      content = content.substring(last_table + t_t.length());
    }

    return "<html><head><title>" + title + "</title></head><body>\r\n<h1>" + title + "</h1>\r\n" + content + "</body></html>";
  }

  /** Returns the index of {@code marker} in {@code text} at or after {@code from}, or fails with a clear message. */
  private static int require_index(String text, String marker, int from, String what) {
    int at = text.indexOf(marker, from);
    if (at == -1) {
      throw new IllegalStateException("Expected markup not found (" + what + "): " + marker);
    }
    return at;
  }

  /** Like {@link #require_index} but returns the index just past the end of the marker. */
  private static int index_after(String text, String marker, int from, String what) {
    return require_index(text, marker, from, what) + marker.length();
  }

  /**
   * Reads the stream to its end, closes it, and returns the bytes as text.
   *
   * @param input stream to drain; it is closed before this method returns, also on failure
   * @return everything read from the stream, decoded with the platform default charset
   * @throws Exception if reading or closing the stream fails
   */
  static String get_content(InputStream input) throws Exception{
    try {
      byte[] b = new byte[1024];
      ByteArrayOutputStream bout = new ByteArrayOutputStream();
      int i;
      while((i=input.read(b))!=-1) {
        bout.write(b, 0, i);
      }
      return bout.toString();
    } finally {
      // Close on every path so a failed read does not leak the connection or file handle.
      input.close();
    }
  }
}
这是截取sina国内新闻数据的代码
我还有很多地方不明白
麻烦帮我讲解一下~~~~~~~~~~

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

package cn.edu.biti.net;import java.io.*;
import java.net.*;public class Test5{
/**
 * Fetches the resource behind a URL and returns its body as text.
 *
 * @param href URL to open
 * @return whatever {@code get_content} produces for the connection's input stream
 * @throws Exception if the URL is malformed or the connection cannot be opened or read
 */
  static String get_from_internet(String href) throws Exception{
    return get_content(new URL(href).openConnection().getInputStream());
  }
  /**
   * Reads a local file and returns its content as text.
   *
   * @param path path of the file to open
   * @return whatever {@code get_content} produces for the file's input stream
   * @throws Exception if the file cannot be opened or read
   */
  static String get_from_file(String path) throws Exception{
    InputStream source = new FileInputStream(path);
    return get_content(source);
  }
  /**
   * Writes the given text to a file, replacing any existing content.
   *
   * @param path path of the file to write
   * @param content text to store, encoded with the platform default charset
   * @throws Exception if the file cannot be opened or written
   */
  static void write_to_file(String path,String content)throws Exception{
    FileOutputStream output = new FileOutputStream(path);
    try {
      output.write(content.getBytes());
    } finally {
      // Close even when the write fails so the file handle is not leaked.
      output.close();
    }
  }

  /**
   * Fetches the index page, walks over every headline link in the headline cell, downloads
   * each article and writes it to {@code <n>.htm}, numbering from 1.
   *
   * <p>When an expected markup fragment is missing, an {@link IllegalStateException} naming
   * the fragment is thrown instead of continuing with a bogus offset.
   *
   * @param args unused
   * @throws Exception on any network or file error, or when expected markup is missing
   */
  public static void main(String[] args) throws Exception{
    String str = get_from_internet("http://news.sina.com.cn/china/");
    String start_tag = "\t<table width=320 cellspacing=0 style='margin:7px 0 7px 0'>\r\n\t<tr><td class='linkBlue f14 lh22'>";

    // Take the substring holding the useful part: the content of the headline table cell.
    int start = index_after(str, start_tag, 0, "headline table start");
    int end = require_index(str, "</td>", start, "headline cell end");
    str = str.substring(start, end);

    // Reset the scan state before iterating over the links.
    String tag = "·<a href=";
    int file_count = 0;
    start = 0;
    while((start=str.indexOf(tag, start))!=-1){
      start += tag.length();
      // The link target is everything between "href=" and the target attribute.
      String href = str.substring(start, require_index(str, "target=_blank>", start, "link target attribute"));
      // Safe: the marker found above contains '>', so this search cannot fail.
      start = str.indexOf(">", start) + 1;
      String title = str.substring(start, require_index(str, "</a>", start, "link closing tag"));

      String content = build_page(title, get_from_internet(href));
      file_count++;
      write_to_file(file_count + ".htm", content);
    }
  }

  /** Cuts the article body out of a downloaded article page and wraps it in a minimal HTML page. */
  private static String build_page(String title, String article) {
    String t_t = "<font id=\"zoom\" class=f14>";
    int t_s = index_after(article, t_t, 0, "article body start");
    int body_end = require_index(article, "<br clear=all>\r\n\t</td></tr>", t_s, "article body end");
    String content = article.substring(t_s, body_end);

    // NOTE(review): the original code searched for "" to find the start of the part to remove,
    // which always yields index 0; the intended marker may have been lost when the code was
    // posted - confirm. The effective behavior is kept: everything up to and including the
    // last </table> is dropped.
    t_t = "</table>";
    int last_table = content.lastIndexOf(t_t);
    if (last_table != -1) {
      // Without this guard a missing </table> would chop the first 7 characters of the body.
      content = content.substring(last_table + t_t.length());
    }

    return "<html><head><title>" + title + "</title></head><body>\r\n<h1>" + title + "</h1>\r\n" + content + "</body></html>";
  }

  /** Returns the index of {@code marker} in {@code text} at or after {@code from}, or fails with a clear message. */
  private static int require_index(String text, String marker, int from, String what) {
    int at = text.indexOf(marker, from);
    if (at == -1) {
      throw new IllegalStateException("Expected markup not found (" + what + "): " + marker);
    }
    return at;
  }

  /** Like {@link #require_index} but returns the index just past the end of the marker. */
  private static int index_after(String text, String marker, int from, String what) {
    return require_index(text, marker, from, what) + marker.length();
  }

  /**
   * Reads the stream to its end, closes it, and returns the bytes as text. Shared by the
   * URL and file readers.
   *
   * @param input stream to drain; it is closed before this method returns, also on failure
   * @return everything read from the stream, decoded with the platform default charset
   * @throws Exception if reading or closing the stream fails
   */
  static String get_content(InputStream input) throws Exception{
    try {
      byte[] b = new byte[1024];
      ByteArrayOutputStream bout = new ByteArrayOutputStream();
      int i;
      while((i=input.read(b))!=-1) {
        bout.write(b, 0, i);
      }
      return bout.toString();
    } finally {
      // Close on every path so a failed read does not leak the connection or file handle.
      input.close();
    }
  }
}
还有哪里不明白,大家再来说
建议楼主用 HttpClient 去获取 HTML，然后用 NekoHTML 解析一下，就可以像处理 XML 一样处理了。相关资料自己 google 一下吧。