跪求高手，好心人，哪位大哥帮忙看看，指导下，跪谢！！！着急！

跪求高手，哪位大哥帮忙看看，我改了很久就是找不到原因，这是关于词频统计的程序，为什么每次处理一个文件夹下面的文件时（.txt的），一个txt处理不完整，也就是只能处理一个txt的固定长度，后面的就处理不了了，我找了好久，都快蒙了，希望哪位高手帮忙看看，这样表述可能不太清楚，哪位好心人帮忙的话我可以发给你源程序你运行下试试，麻烦大哥给看看啊，真是非常感谢，很着急啊！！！！package df1;import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Collection {
//文本数组
public Document[] documents;
public int count=0;
//

public List<Term> termList = new LinkedList<Term>();
//停用词列表
  public ArrayList<String> stopList = new ArrayList<String>();
//无参构造函数
public Collection() { } //带参构造函数
public Collection(String[] docs){
setDocuments(docs);
}
//获取停用词列表
public void loadStopList() throws IOException {
BufferedReader stopListRead = new BufferedReader(new FileReader(
"E:\\stopwords.txt"));
Pattern p = Pattern.compile("[\u4e00-\u9fa5]+");
for (String str = ""; str != null; str = stopListRead.readLine()) {
if (str.length() == 0)
continue;//如果某一行的内容为空行，则扫描下一行,提高效率；
Matcher m = p.matcher(str);
while (m.find())
stopList.add(m.group());
}
} //计算文本及伪文档中单词的频数
public void process() throws Exception {
// 对停用词列表进行排序，以方便后面的二分查找过程 new Collection().loadStopList(); Arrays.sort(stopList.toArray()); //计算文本中单词的频数
for (int i = 0; i < documents.length; i++) {
documents[i].computeTerms();
}
} //设置并返回文本对象数组
public Document[] getDocuments() {
return documents;
} public void setDocuments(Document[] docs) {
this.documents = docs;
} //设置文本对象
public void setDocuments(String[] docs) {
documents = new Document[docs.length];
for (int i = 0; i < documents.length; i++) {
documents[i] = new Document();
documents[i].setContent(docs[i]);
}
} //计算IDF
public void computeDF() {

//获取termList，保证term的DF>=2
for (int i = 0; i < documents.length; i++) {
for (int j = 0; j < documents[i].getTerms().size(); j++) {
// 提取每文本的terms中的单词
Term t = (Term) (documents[i].getTerms().get(j));
int index = termList.indexOf(t);
Term newT;
if (index == -1) {//index=-1 则说明termList中不含有t
newT = new Term();
newT.setTerm(t.getTerm());
} else {// 有，读取该单词
newT = (Term) (termList.get(index));
}
//
newT.setTf(newT.getTf() + t.getTf());
newT.setDf(newT.getDf() + 1);
termList.add(newT);

}
removeDuplicateWithOrder(termList);
Collections.sort(termList,new Compare());

}
for(int m=0;m<termList.size();m++){
Term te=(Term) termList.get(m);
count+=te.getTf();
}
//遍历termList，计算IDF
for (int i = 0; i < termList.size(); i++) {
Term t = (Term) termList.get(i);
// IDF=(N+1)/df N is the total number of the documents
     t.setZTf(count);
t.setIdf(( (double) t.getDf()/documents.length) );
t.setTtf((double)t.getTf()/count);
t.setTdf(t.getIdf()+t.getTtf());
}

} //输出termList中的单词，及其属性信息（Tf Df Idf TfIdf）
public void printCollectionTermList() { for (int i = 0; i < termList.size(); i++) {
Term t = (Term) termList.get(i); System.out.print(t.getTerm() + "\t" + t.getTf() + "\t" + t.getDf()
+ "\t" +t.getTtf()+"\t"+ t.getIdf() + "\t"+t.getTdf()+"\t" + documents.length); System.out.println();
} } public void printTermTfIdf() {
printTermTfIdfOfDocuments(documents);
// printTermTfIdfOfDocuments(queryTerms);
} //输出所有文本中单词的TfIdf
public void printTermTfIdfOfDocuments(Document[] docs) {
for (int i = 0; i < docs.length; i++) {
for (int j = 0; j < docs[i].getTerms().size(); j++) {
Term t = (Term) (docs[i].getTerms().get(j));
// System.out.println(t.getTerm() + '\t' + t.getTfIdf());
}
} } //输出所有文本中term及其相应的Tf（出现的次数）
public void printDocuments() {
for (int i = 0; i < documents.length; i++) {
documents[i].printDocument();
}
}
public static void removeDuplicateWithOrder(List list) {
    Set set = new HashSet();
     List newList = new ArrayList();
   for (Iterator iter = list.iterator(); iter.hasNext();) {
         Object element = iter.next();
         if (set.add(element))
            newList.add(element);
      }
     list.clear();
     list.addAll(newList);
//     System.out.println( " remove duplicate " + list);
}
}
import java.util.ArrayList;
import java.util.StringTokenizer;
public class Document { //文本单词列表（不包含停用词）
private ArrayList<Term> terms = new ArrayList<Term>();
//文本内容
private String content; public String getContent() {
return content;
} public void setContent(String content) {
this.content = content;
} //查看term列表（所有单词）知否已经包含t
public boolean findTerm(Term t) {
if (terms.indexOf(t) == -1)
return false;
else
return true;
} //计算文本中term的频率
public void computeTerms() {
// 将文档内容全部转化为小写字母，并以\\W(非单词字符)分割后，存入tokens字符串数组 StringTokenizer st = new StringTokenizer(content," ");
String[] tokens = new String[st.countTokens()];
int   index1=0;
while(st.hasMoreTokens()){
tokens[index1]=st.nextToken();
index1++;
}
// 遍历tokens字符串数组
// System.out.println(tokens.length);
for (int i = 0; i < tokens.length; i++) {
if (!(tokens[i].equals("") || tokens[i].length() == 1))
//在stoplist中进行二分查找，返回索引<0,说明没有查找到，说明当前token不是停用词
//if (Arrays.binarySearch(Collection.stopList.toArray(),
//tokens[i]) == -1) {
if(!(new Collection()).stopList.contains(tokens[i])){
Term t = new Term();
t.setTerm(tokens[i]);
t.setTf(1); //获取单词t在terms中的索引，若不存在，索引值为-1
int index = terms.indexOf(t);
//若index<0,则说明term中不存在t，将t存入term列表
if (index == -1)
terms.add(t);
else {
//term已经存在，则相应地将其词频+1
Term tone = (terms.get(index));
tone.setTf(tone.getTf() + 1);
}
}
}

} //输出文本中term及其相应的Tf（出现的次数）
public void printDocument() {
for (int j = 0; j < terms.size(); j++) {
Term t = terms.get(j);
System.out.print(t.getTerm() + "(" + t.getTf() + ")" + "\t");
}
System.out.println();
} //设置并获取文本的term
public ArrayList<Term> getTerms() {
return terms;
} public void setTerms(ArrayList<Term> terms) {
this.terms = terms;
} }
这是我写的测试代码public class Test {
static String text=null;
public static void main(String[] args) throws Exception {
copyFolder("E:\\chx3","E:\\chx4");
}
public static  void  copyFolder(String  oldPath,  String  newPath)  {

       try  {
           (new  File(newPath)).mkdirs();  //如果文件夹不存在  则建立新文件夹
           File  a=new File(oldPath);
           String[]  file=a.list();
           long begin_time_count=System.currentTimeMillis();
           String line="";
           File  temp=null;
           for  (int  i  =  0;  i  <  file.length;  i++)  {
               if(oldPath.endsWith(File.separator)){
                   temp=new  File(oldPath+file[i]);
               }
               else{
                   temp=new  File(oldPath+File.separator+file[i]);
               }

               if(temp.isFile()){
                   BufferedReader  input  =  new  BufferedReader(new FileReader(temp));
                   BufferedWriter output=new BufferedWriter(new FileWriter(newPath  +  "/"  +
                           (temp.getName()).toString()));
                String[] string=new String[input.toString().length()];
         for(int j=0;j<string.length;j++){
           String s = input.readLine();
                      string[j] = s;
         }

         Collection allDocs = new Collection(string);
         allDocs.process();
         allDocs.computeDF();
         System.out.println("打印CollectionTermList：");
         for (int m = 0; m < allDocs.termList.size(); m++) {
         Term t = (Term) allDocs.termList.get(m);
         output.write(t.getTerm() + "  " +"TF："+ t.getTf() + "  " +"DF:"+ t.getDf()
         +"  "+"DDF:"+ t.getIdf() + "  "+"TTF:"+t.getTtf()+"  "+"TDF:"+t.getTdf()+"  "+"ZTF:"+ t.getZTf()+"  "+"ZDF:"+allDocs.documents.length);
                     output.newLine();
                     output.flush();
         }

                   output.close();
                   input.close();

               }
                   long end_time_count=System.currentTimeMillis();
           System.out.println("用时（毫秒）："+(end_time_count-begin_time_count));                if(temp.isDirectory()){//如果是子文件夹
                   copyFolder(oldPath+"/"+file[i],newPath+"/"+file[i]);
               }
           }
       }
       catch  (Exception  e)  {
           System.out.println("操作出错");
           e.printStackTrace();

       }

   }
}

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

JAVA这个我不太懂，哥们祝你好运！
我找到原因了，是因为一篇文章没完全读到字符串数组中，怎么样把下面的1.txt完全读出来？请高手指导
我刚试了下，只能输出一半，是不是跟循环有关？ BufferedReader  input  =  new  BufferedReader(new FileReader("E:\\1.txt"));

     String[] string=new String[input.toString().length()];
for(int j=0;j<string.length;j++){
   String s = input.readLine();
           string[j] = s;
           System.out.println(s);//输出结果不完整？？？
}
这样循环遍历！ BufferedReader  input  =  new  BufferedReader(new FileReader("E:\\1.txt"));
        String str=null;
        while((str=input.readLine())!=null){
           System.out.println(str);
        }
谢谢你
这样的循环我知道，但是要把这个txt文件完全放到数组中，然后再读出来用，这是上面程序要用的，大哥知道怎么改吗？
我是这样改的，但是输出后有几个null，不知道为什么？高手指导下，谢谢 String line=null;
BufferedReader  input  =  new  BufferedReader(new FileReader("E:\\1.txt"));
     String[] string=new String[input.toString().length()];
     while ((line = input.readLine()) != null){
for(int j=0;j<string.length;j++){
String s = input.readLine();
string[j] = s;
           System.out.println(s);
}
     }
readLine() 读一行 while中读了2次
就放到一个数组中么？BufferedReader  input  =  new  BufferedReader(new FileReader("E:\\1.txt"));
        String str=null;
        List<String> list=new ArrayList<String>();
        while((str=input.readLine())!=null){
         list.add(str);
        }
        String[] array=list.toArray(new String[0]);
        String[] string = new String[input.toString().length()];
int i=0;
while((s=input.read())!=-1){
Character c= new Character((char)s);
string[i] = c.toString();
i++;
}