import java.io.*;
import java.net.*;
import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException;
public class Text {
public static void main(String[] args) throws IOException{
URL url = new URL("http://www.baidu.com");
InputStreamReader isr = new InputStreamReader(url.openStream());
BufferedReader br = new BufferedReader(isr);
String s;
while((s = br.readLine())!=null)
System.out.println(getText(s)); br.close();
}
public String getText(String url)throws ParserException{
StringBean sb = new StringBean();
try{
sb.setLinks(false);// 设置不需要得到页面所包含的链接信息
sb.setReplaceNonBreakingSpaces(true);// 设置将不间断空格由正规空格所替代
sb.setCollapse(true);// 设置将一序列空格由一个单一空格所代替
sb.setURL(url);//传入要解析的URL
}catch(ParserException e){
log.error(e); }
return sb.getStrings();
}
}
红色处报错
我的意图就是能够读取html网页的纯文本内容,求改错,谢谢
import java.net.*;
import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException;
public class Text {
public static void main(String[] args) throws IOException{
URL url = new URL("http://www.baidu.com");
InputStreamReader isr = new InputStreamReader(url.openStream());
BufferedReader br = new BufferedReader(isr);
String s;
while((s = br.readLine())!=null)
System.out.println(getText(s)); br.close();
}
public String getText(String url)throws ParserException{
StringBean sb = new StringBean();
try{
sb.setLinks(false);// 设置不需要得到页面所包含的链接信息
sb.setReplaceNonBreakingSpaces(true);// 设置将不间断空格由正规空格所替代
sb.setCollapse(true);// 设置将一序列空格由一个单一空格所代替
sb.setURL(url);//传入要解析的URL
}catch(ParserException e){
log.error(e); }
return sb.getStrings();
}
}
红色处报错
我的意图就是能够读取html网页的纯文本内容,求改错,谢谢
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.net.URL;
public class URLReader {
public static void main(String[] args)
{
try{
URL url = new URL("http://www.baidu.com");
File writeFil = new File("d:\\baidu.html");
BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
BufferedWriter out = new BufferedWriter(new FileWriter(writeFil));
String inputLine = null;
while((inputLine = in.readLine())!=null)
{
out.write(inputLine);
System.out.println(inputLine);
}
out.flush();
in.close();
out.close();
}
catch(Exception ex){
ex.printStackTrace();
}
}
}
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.net.URL;
public class URLReader {
public static void main(String[] args)
{
try{
URL url = new URL("http://www.baidu.com");
File writeFil = new File("d:\\baidu.html");
BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
BufferedWriter out = new BufferedWriter(new FileWriter(writeFil));
String inputLine = null;
while((inputLine = in.readLine())!=null)
{
out.write(inputLine);
System.out.println(inputLine);
}
out.flush();
in.close();
out.close();
}
catch(Exception ex){
ex.printStackTrace();
}
}
}
你没有理解我的意思,不要怕htm,要txt,我想要的是纯文本信息,不要html的标签信息
{
text = stripTags(clear(text));
text = text.replaceAll(" ", " ").replaceAll("[\r\n]", "<br/>").replaceAll("\\t", " ");
return text;
}
http://tiantian911.iteye.com/blog/184678
试了,不行
我给出了程序地址,但是调的时候老出错,那个log.error老报错
--------------------
http://www.qinglobo.com/qklist-6.htm
System.out.println(getText(s)); br.close();
}
---------------------------------
请问while语句有什么用?