import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FilterReader;
import java.io.IOException;
import java.io.Reader;

/**
 * A {@link FilterReader} that strips HTML-style tags from the character
 * stream it wraps: everything from a {@code '<'} up to and including the next
 * {@code '>'} is dropped, all other characters are passed through unchanged.
 *
 * <p>Tag state is kept across calls, so a tag that straddles two reads of the
 * underlying reader is still removed completely.
 */
public class RemoveHTMLReader extends FilterReader {

    /** True while the current read position is inside a {@code <...>} tag. */
    boolean intag = false;

    /**
     * Creates a tag-stripping reader on top of {@code in}.
     *
     * @param in the reader supplying the raw (tagged) text
     */
    public RemoveHTMLReader(Reader in) {
        super(in);
    }

    /**
     * Reads up to {@code len} characters into {@code buf} starting at
     * {@code off}, with tags removed. The surviving characters are compacted
     * to the front of the requested region; only the first <i>returned
     * count</i> characters starting at {@code off} are meaningful, whatever
     * is left behind them in {@code buf} is stale data.
     *
     * @param buf destination buffer
     * @param off index in {@code buf} at which to start storing characters
     * @param len maximum number of characters to read
     * @return the number of characters stored, {@code 0} if {@code len} is
     *         zero, or {@code -1} at end of stream
     * @throws IOException if the underlying reader fails
     */
    @Override
    public int read(char[] buf, int off, int len) throws IOException {
        // A zero-length request makes the underlying reader return 0, which
        // would otherwise keep the retry loop below spinning forever.
        if (len == 0) {
            return 0;
        }
        int numchars = 0;
        // Keep reading while a chunk consisted of tag text only: returning 0
        // for a non-empty request would make callers loop or misjudge EOF.
        while (numchars == 0) {
            numchars = in.read(buf, off, len);
            if (numchars == -1) {
                return -1;
            }
            int last = off; // next write position for a kept character
            for (int i = off; i < off + numchars; i++) {
                char c = buf[i];
                if (intag) {
                    if (c == '>') {
                        intag = false;
                    }
                } else if (c == '<') {
                    intag = true;
                } else {
                    buf[last++] = c;
                }
            }
            numchars = last - off;
        }
        return numchars;
    }

    /**
     * Reads a single character with tags removed.
     *
     * @return the character read, or {@code -1} at end of stream
     * @throws IOException if the underlying reader fails
     */
    @Override
    public int read() throws IOException {
        char[] buf = new char[1];
        // Go through the filtering read above, not in.read(...), so that
        // single-character reads are stripped of tags as well.
        int result = read(buf, 0, 1);
        if (result == -1) {
            return -1;
        }
        return (int) buf[0];
    }

    /** Command-line demo: prints the file named by the only argument with tags removed. */
    public static class Test {
        /**
         * @param args exactly one element: the path of the file to print
         */
        public static void main(String args[]) {
            try {
                if (args.length != 1) {
                    throw new IllegalArgumentException("Wrong number of args");
                }
                BufferedReader in = new BufferedReader(new RemoveHTMLReader(new FileReader(args[0])));
                try {
                    String line;
                    while ((line = in.readLine()) != null) {
                        System.out.println(line);
                    }
                } finally {
                    // Release the file even if reading or printing fails.
                    in.close();
                }
            } catch (Exception e) {
                System.err.println(e);
            }
        }
    }
}
这个程序要实现的功能是:打印出去掉HTML标记后的文件内容。这里的RemoveHTMLReader覆盖了抽象类FilterReader的2个read方法。但我有一点不明白:这2个read方法是在什么时候被调用的?
if(!intag){
if(buf[i] == '<')
intag = true;
else
buf[last++] = buf[i];
}
else{
if(buf[i] == '>')
intag = false;
}
}这段就是判断如果不是Html标记,就赋值,是Html标记,就跳过,但我觉得这样赋值完后,可能还会残留html标记啊,比如说,文件中的内容为:
<html>aa</html>
那么执行完后,buf里的内容就成为了atml</html>(假设read参数off为0,而len为文件中内容的长度),不知我理解的是否有问题
if(!intag){
if(buf[i] == '<')
intag = true;
else
buf[last++] = buf[i];
}
else{
if(buf[i] == '>')
intag = false;
}
/**
 * Reads one line of text by delegating to {@code readLine(false)}, i.e.
 * without requesting that a leading line feed be ignored.
 */
public String readLine() throws IOException {
return readLine(false);
}
上面是BufferedReader中的readLine()方法
跳转到以下方法String readLine(boolean ignoreLF) throws IOException {
// Returns the next line from the buffer cb without its '\n' / '\r'
// terminator, or null when the end of input is reached with nothing
// accumulated. ignoreLF: when true, a '\n' at the current position is skipped.
StringBuffer s = null;
int startChar; synchronized (lock) {
ensureOpen();
// Skip a leading '\n' if the caller asked for it or if the previous line
// ended in '\r' (skipLF is set for that case further down).
boolean omitLF = ignoreLF || skipLF; bufferLoop:
// Each pass scans what is currently buffered; fill() is called whenever
// the buffer has been used up.
for (;;) { if (nextChar >= nChars)
fill();
if (nextChar >= nChars) { /* EOF: fill() produced no new characters */
// Hand back a final unterminated line if any text was collected.
if (s != null && s.length() > 0)
return s.toString();
else
return null;
}
boolean eol = false;
char c = 0;
int i; /* Skip a leftover '\n', if necessary */
if (omitLF && (cb[nextChar] == '\n'))
nextChar++;
skipLF = false;
omitLF = false; charLoop:
// Look for the first line terminator in cb[nextChar, nChars).
for (i = nextChar; i < nChars; i++) {
c = cb[i];
if ((c == '\n') || (c == '\r')) {
eol = true;
break charLoop;
}
} startChar = nextChar;
nextChar = i; if (eol) {
// Terminator found: the line is cb[startChar, i), appended to any
// text carried over from earlier passes.
String str;
if (s == null) {
str = new String(cb, startChar, i - startChar);
} else {
s.append(cb, startChar, i - startChar);
str = s.toString();
}
// Step over the terminator itself.
nextChar++;
if (c == '\r') {
// Remember to drop a '\n' that directly follows this '\r'.
skipLF = true;
}
return str;
}
// No terminator in this chunk: stash it and go around to refill.
if (s == null)
s = new StringBuffer(defaultExpectedLineLength);
s.append(cb, startChar, i - startChar);
}
}
}fill方法如下 private void fill() throws IOException {
// Refills cb from the underlying reader `in`, first preserving any
// characters that have to be kept for a mark.
// NOTE(review): `edChar` looks like an extraction-garbled `markedChar`
// (the text "mark" seems to have been dropped) - confirm against the JDK source.
int dst;
if (edChar <= UNMARKED) {
/* No mark set: new data can go at the start of the buffer */
dst = 0;
} else {
/* Marked */
// delta = number of characters between the mark and the read position.
int delta = nextChar - edChar;
if (delta >= readAheadLimit) {
/* Gone past read-ahead limit: Invalidate */
edChar = INVALIDATED;
readAheadLimit = 0;
dst = 0;
} else {
if (readAheadLimit <= cb.length) {
/* Shuffle in the current buffer */
System.arraycopy(cb, edChar, cb, 0, delta);
edChar = 0;
dst = delta;
} else {
/* Reallocate buffer to accommodate read-ahead limit */
char ncb[] = new char[readAheadLimit];
System.arraycopy(cb, edChar, ncb, 0, delta);
cb = ncb;
edChar = 0;
dst = delta;
}
nextChar = nChars = delta;
}
} int n;
// Retry for as long as the underlying reader reports zero characters.
do {
n = in.read(cb, dst, cb.length - dst);
} while (n == 0);
// On success publish the new buffer bounds; when n is negative the fields
// are left untouched, which the caller's nextChar >= nChars check treats as EOF.
if (n > 0) {
nChars = dst + n;
nextChar = dst;
}
}上面in.read(cb,dst,cb.length-dst)就是你复写的那方法了
至于另外一个方法(无参数的read()),在这个程序里没有被调用到。
<html>aa</html>
那么执行完后,buf里的内容就成为了atml</html>(假设read参数off为0,而len为文件中内容的长度),不知我理解的是否有问题
程序应该没什么问题:read返回的numchars告诉调用者buf中从off开始只有前numchars个字符是有效的,后面残留的旧字符不会被调用者读取。建议楼主debug一下。