跪求高手帮忙看看:这是一个词频统计的程序。处理一个文件夹下面的 .txt 文件时,每个 txt 都只能处理开头固定长度的一部分,后面的内容就处理不了了。我改了很久也找不到原因,希望哪位高手帮忙看看;需要的话我可以把源程序发给你运行试试。非常感谢,很着急!
package df1;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A collection of documents with collection-wide term statistics
 * (TF, DF and several derived ratios) for word-frequency counting.
 */
public class Collection {
    // The documents making up this collection.
    public Document[] documents;
    // Total number of term occurrences over the whole collection (set by computeDF).
    public int count = 0;
    // Collection-wide term list: one entry per distinct term (built by computeDF).
    public List<Term> termList = new LinkedList<Term>();
    // Stop-word list. Made static so every Collection instance — including the
    // throwaway instances created inside Document.computeTerms() — consults the
    // SAME loaded list. With an instance field, the list filled by loadStopList()
    // was never the one checked during tokenisation, so no stop word was ever
    // filtered. (Existing `instance.stopList` accesses still compile.)
    public static ArrayList<String> stopList = new ArrayList<String>();

    /** No-arg constructor. */
    public Collection() { }

    /** Builds the collection from one content string per document. */
    public Collection(String[] docs) {
        setDocuments(docs);
    }

    /**
     * Loads the stop-word list from E:\stopwords.txt: every run of Chinese
     * characters on every non-blank line becomes one stop word.
     * The reader is now closed in a finally block (the original leaked it).
     *
     * @throws IOException if the stop-word file cannot be read
     */
    public void loadStopList() throws IOException {
        Pattern p = Pattern.compile("[\u4e00-\u9fa5]+");
        BufferedReader stopListRead = new BufferedReader(new FileReader(
                "E:\\stopwords.txt"));
        try {
            for (String str = stopListRead.readLine(); str != null;
                    str = stopListRead.readLine()) {
                if (str.length() == 0)
                    continue; // skip blank lines
                Matcher m = p.matcher(str);
                while (m.find())
                    stopList.add(m.group());
            }
        } finally {
            stopListRead.close();
        }
    }

    /**
     * Computes per-document term frequencies for every document.
     * The stop-word list is loaded first: in the original code the
     * loadStopList() call sat inside a comment (and targeted a fresh
     * instance), so the stop list was always empty.
     *
     * @throws Exception if loading the stop list or processing fails
     */
    public void process() throws Exception {
        if (stopList.isEmpty())
            loadStopList();
        for (int i = 0; i < documents.length; i++) {
            documents[i].computeTerms();
        }
    }

    /** Returns the document array. */
    public Document[] getDocuments() {
        return documents;
    }

    public void setDocuments(Document[] docs) {
        this.documents = docs;
    }

    /** Wraps each content string into a Document. */
    public void setDocuments(String[] docs) {
        documents = new Document[docs.length];
        for (int i = 0; i < documents.length; i++) {
            documents[i] = new Document();
            documents[i].setContent(docs[i]);
        }
    }

    /**
     * Builds termList from all documents and computes each term's statistics:
     * collection TF, DF, and the derived Idf/Ttf/Tdf/ZTf values.
     * Unlike the original, a term is appended only when first seen (the
     * original re-added existing entries and relied on a de-duplication pass),
     * and the list is sorted once at the end instead of once per document.
     */
    public void computeDF() {
        for (int i = 0; i < documents.length; i++) {
            for (int j = 0; j < documents[i].getTerms().size(); j++) {
                Term t = (Term) (documents[i].getTerms().get(j));
                int index = termList.indexOf(t);
                Term newT;
                if (index == -1) { // first sighting in the whole collection
                    newT = new Term();
                    newT.setTerm(t.getTerm());
                    termList.add(newT);
                } else {           // already known: reuse the entry
                    newT = (Term) (termList.get(index));
                }
                // Accumulate collection-wide TF. DF grows by one per containing
                // document, which is correct because each document's term list
                // already holds every term at most once.
                newT.setTf(newT.getTf() + t.getTf());
                newT.setDf(newT.getDf() + 1);
            }
        }
        Collections.sort(termList, new Compare());
        // Total number of term occurrences in the collection.
        for (int m = 0; m < termList.size(); m++) {
            Term te = (Term) termList.get(m);
            count += te.getTf();
        }
        // Derived per-term statistics.
        for (int i = 0; i < termList.size(); i++) {
            Term t = (Term) termList.get(i);
            t.setZTf(count);
            t.setIdf((double) t.getDf() / documents.length);
            t.setTtf((double) t.getTf() / count);
            t.setTdf(t.getIdf() + t.getTtf());
        }
    }

    /** Prints every term of termList with its statistics, one term per line. */
    public void printCollectionTermList() {
        for (int i = 0; i < termList.size(); i++) {
            Term t = (Term) termList.get(i);
            System.out.print(t.getTerm() + "\t" + t.getTf() + "\t" + t.getDf()
                    + "\t" + t.getTtf() + "\t" + t.getIdf() + "\t" + t.getTdf()
                    + "\t" + documents.length);
            System.out.println();
        }
    }

    public void printTermTfIdf() {
        printTermTfIdfOfDocuments(documents);
        // printTermTfIdfOfDocuments(queryTerms);
    }

    /** Walks every term of every document (the printing itself is commented out). */
    public void printTermTfIdfOfDocuments(Document[] docs) {
        for (int i = 0; i < docs.length; i++) {
            for (int j = 0; j < docs[i].getTerms().size(); j++) {
                Term t = (Term) (docs[i].getTerms().get(j));
                // System.out.println(t.getTerm() + '\t' + t.getTfIdf());
            }
        }
    }

    /** Prints each document's terms with their frequencies. */
    public void printDocuments() {
        for (int i = 0; i < documents.length; i++) {
            documents[i].printDocument();
        }
    }

    /**
     * Removes duplicate elements from list while keeping first-seen order.
     * Kept for external callers even though computeDF no longer needs it.
     */
    public static void removeDuplicateWithOrder(List list) {
        Set set = new HashSet();
        List newList = new ArrayList();
        for (Iterator iter = list.iterator(); iter.hasNext();) {
            Object element = iter.next();
            if (set.add(element))
                newList.add(element);
        }
        list.clear();
        list.addAll(newList);
    }
}
import java.util.ArrayList;
import java.util.StringTokenizer;
/** One document: its raw text and its term-frequency list (stop words excluded). */
public class Document {
    // Distinct terms of this document with their in-document frequencies.
    private ArrayList<Term> terms = new ArrayList<Term>();
    // Raw text content; terms are produced by splitting this on spaces.
    private String content;

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    /** Returns true when the term list already contains t. */
    public boolean findTerm(Term t) {
        return terms.indexOf(t) != -1;
    }

    /**
     * Tokenises content on single spaces and counts each term's frequency.
     * Empty and single-character tokens and stop words are skipped.
     * The stop list is looked up ONCE before the loop: the original built a
     * brand-new Collection for every single token, which was pure allocation
     * overhead (and consulted an always-empty instance list).
     */
    public void computeTerms() {
        StringTokenizer st = new StringTokenizer(content, " ");
        ArrayList<String> stopWords = new Collection().stopList;
        while (st.hasMoreTokens()) {
            String token = st.nextToken();
            // Skip empty and one-character tokens.
            if (token.equals("") || token.length() == 1)
                continue;
            if (stopWords.contains(token))
                continue;
            Term t = new Term();
            t.setTerm(token);
            t.setTf(1);
            int index = terms.indexOf(t); // -1 when t is not yet listed
            if (index == -1)
                terms.add(t); // first occurrence in this document
            else {
                // Seen before: bump the stored frequency.
                Term tone = terms.get(index);
                tone.setTf(tone.getTf() + 1);
            }
        }
    }

    /** Prints each term as "term(tf)" separated by tabs, then a newline. */
    public void printDocument() {
        for (int j = 0; j < terms.size(); j++) {
            Term t = terms.get(j);
            System.out.print(t.getTerm() + "(" + t.getTf() + ")" + "\t");
        }
        System.out.println();
    }

    /** Accessors for the term list. */
    public ArrayList<Term> getTerms() {
        return terms;
    }

    public void setTerms(ArrayList<Term> terms) {
        this.terms = terms;
    }
}
这是我写的测试代码public class Test {
static String text=null;
public static void main(String[] args) throws Exception {
copyFolder("E:\\chx3","E:\\chx4");
}
public static void copyFolder(String oldPath, String newPath) {
try {
(new File(newPath)).mkdirs(); //如果文件夹不存在 则建立新文件夹
File a=new File(oldPath);
String[] file=a.list();
long begin_time_count=System.currentTimeMillis();
String line="";
File temp=null;
for (int i = 0; i < file.length; i++) {
if(oldPath.endsWith(File.separator)){
temp=new File(oldPath+file[i]);
}
else{
temp=new File(oldPath+File.separator+file[i]);
}
if(temp.isFile()){
BufferedReader input = new BufferedReader(new FileReader(temp));
BufferedWriter output=new BufferedWriter(new FileWriter(newPath + "/" +
(temp.getName()).toString()));
String[] string=new String[input.toString().length()];
for(int j=0;j<string.length;j++){
String s = input.readLine();
string[j] = s;
}
Collection allDocs = new Collection(string);
allDocs.process();
allDocs.computeDF();
System.out.println("打印CollectionTermList:");
for (int m = 0; m < allDocs.termList.size(); m++) {
Term t = (Term) allDocs.termList.get(m);
output.write(t.getTerm() + " " +"TF:"+ t.getTf() + " " +"DF:"+ t.getDf()
+" "+"DDF:"+ t.getIdf() + " "+"TTF:"+t.getTtf()+" "+"TDF:"+t.getTdf()+" "+"ZTF:"+ t.getZTf()+" "+"ZDF:"+allDocs.documents.length);
output.newLine();
output.flush();
}
output.close();
input.close();
}
long end_time_count=System.currentTimeMillis();
System.out.println("用时(毫秒):"+(end_time_count-begin_time_count)); if(temp.isDirectory()){//如果是子文件夹
copyFolder(oldPath+"/"+file[i],newPath+"/"+file[i]);
}
}
}
catch (Exception e) {
System.out.println("操作出错");
e.printStackTrace();
}
}
}
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Collection {
    // The documents making up this collection.
    public Document[] documents;
    // Total number of term occurrences over the whole collection (set by computeDF).
    public int count=0;
    // Collection-wide term list: one entry per distinct term (built by computeDF).
    public List<Term> termList = new LinkedList<Term>();
    // Stop-word list, filled by loadStopList().
    // NOTE(review): this is an INSTANCE field, but Document.computeTerms checks
    // the list of a freshly constructed Collection — so the loaded words are
    // never actually consulted; consider making it static.
    public ArrayList<String> stopList = new ArrayList<String>();
    // No-arg constructor.
    public Collection() { }
    // Constructor taking one content string per document.
    public Collection(String[] docs){
        setDocuments(docs);
    }
    // Loads the stop-word list from E:\stopwords.txt: every run of Chinese
    // characters on every non-blank line becomes one stop word.
    // NOTE(review): the FileReader is never closed — resource leak.
    public void loadStopList() throws IOException {
        BufferedReader stopListRead = new BufferedReader(new FileReader(
                "E:\\stopwords.txt"));
        Pattern p = Pattern.compile("[\u4e00-\u9fa5]+");
        for (String str = ""; str != null; str = stopListRead.readLine()) {
            if (str.length() == 0)
                continue; // blank line: move on to the next one
            Matcher m = p.matcher(str);
            while (m.find())
                stopList.add(m.group());
        }
    }
    // Computes term frequencies for every document.
    // NOTE(review): the intended statements
    //   new Collection().loadStopList(); Arrays.sort(stopList.toArray());
    // sit inside a comment in the original, and even if executed they would fill
    // a DIFFERENT instance's list / sort a throwaway array — so stop words are
    // never loaded here.
    public void process() throws Exception {
        for (int i = 0; i < documents.length; i++) {
            documents[i].computeTerms();
        }
    }
    // Returns the document array.
    public Document[] getDocuments() {
        return documents;
    }
    public void setDocuments(Document[] docs) {
        this.documents = docs;
    }
    // Wraps each content string into a Document.
    public void setDocuments(String[] docs) {
        documents = new Document[docs.length];
        for (int i = 0; i < documents.length; i++) {
            documents[i] = new Document();
            documents[i].setContent(docs[i]);
        }
    }
    // Builds termList from all documents and computes TF/DF-derived statistics.
    public void computeDF() {
        for (int i = 0; i < documents.length; i++) {
            for (int j = 0; j < documents[i].getTerms().size(); j++) {
                // take each distinct term of the current document
                Term t = (Term) (documents[i].getTerms().get(j));
                int index = termList.indexOf(t);
                Term newT;
                if (index == -1) { // not yet in termList: create a new entry
                    newT = new Term();
                    newT.setTerm(t.getTerm());
                } else {           // already present: reuse the existing entry
                    newT = (Term) (termList.get(index));
                }
                // accumulate collection TF; DF += 1 per containing document
                // (correct because each document's term list is already distinct)
                newT.setTf(newT.getTf() + t.getTf());
                newT.setDf(newT.getDf() + 1);
                // NOTE(review): existing entries are re-added here and removed
                // again below — wasteful (the dedup/sort also runs once per
                // document) but harmless, since the duplicate is the same object.
                termList.add(newT);
            }
            removeDuplicateWithOrder(termList);
            Collections.sort(termList,new Compare());
        }
        // total number of term occurrences in the collection
        for(int m=0;m<termList.size();m++){
            Term te=(Term) termList.get(m);
            count+=te.getTf();
        }
        // derived per-term statistics
        for (int i = 0; i < termList.size(); i++) {
            Term t = (Term) termList.get(i);
            t.setZTf(count);
            t.setIdf(( (double) t.getDf()/documents.length) );
            t.setTtf((double)t.getTf()/count);
            t.setTdf(t.getIdf()+t.getTtf());
        }
    }
    // Prints every term of termList with its statistics (Tf Df Ttf Idf Tdf).
    public void printCollectionTermList() {
        for (int i = 0; i < termList.size(); i++) {
            Term t = (Term) termList.get(i);
            System.out.print(t.getTerm() + "\t" + t.getTf() + "\t" + t.getDf()
                    + "\t" +t.getTtf()+"\t"+ t.getIdf() + "\t"+t.getTdf()+"\t" + documents.length);
            System.out.println();
        }
    }
    public void printTermTfIdf() {
        printTermTfIdfOfDocuments(documents);
        // printTermTfIdfOfDocuments(queryTerms);
    }
    // Walks every term of every document (the printing itself is commented out).
    public void printTermTfIdfOfDocuments(Document[] docs) {
        for (int i = 0; i < docs.length; i++) {
            for (int j = 0; j < docs[i].getTerms().size(); j++) {
                Term t = (Term) (docs[i].getTerms().get(j));
                // System.out.println(t.getTerm() + '\t' + t.getTfIdf());
            }
        }
    }
    // Prints each document's terms with their frequencies.
    public void printDocuments() {
        for (int i = 0; i < documents.length; i++) {
            documents[i].printDocument();
        }
    }
    // Removes duplicate elements from list while keeping first-seen order.
    public static void removeDuplicateWithOrder(List list) {
        Set set = new HashSet();
        List newList = new ArrayList();
        for (Iterator iter = list.iterator(); iter.hasNext();) {
            Object element = iter.next();
            if (set.add(element))
                newList.add(element);
        }
        list.clear();
        list.addAll(newList);
        // System.out.println( " remove duplicate " + list);
    }
}
import java.util.ArrayList;
import java.util.StringTokenizer;
public class Document { //文本单词列表(不包含停用词)
private ArrayList<Term> terms = new ArrayList<Term>();
//文本内容
private String content; public String getContent() {
return content;
} public void setContent(String content) {
this.content = content;
} //查看term列表(所有单词)知否已经包含t
public boolean findTerm(Term t) {
if (terms.indexOf(t) == -1)
return false;
else
return true;
} //计算文本中term的频率
public void computeTerms() {
// 将文档内容全部转化为小写字母,并以\\W(非单词字符)分割后,存入tokens字符串数组 StringTokenizer st = new StringTokenizer(content," ");
String[] tokens = new String[st.countTokens()];
int index1=0;
while(st.hasMoreTokens()){
tokens[index1]=st.nextToken();
index1++;
}
// 遍历tokens字符串数组
// System.out.println(tokens.length);
for (int i = 0; i < tokens.length; i++) {
if (!(tokens[i].equals("") || tokens[i].length() == 1))
//在stoplist中进行二分查找,返回索引<0,说明没有查找到,说明当前token不是停用词
//if (Arrays.binarySearch(Collection.stopList.toArray(),
//tokens[i]) == -1) {
if(!(new Collection()).stopList.contains(tokens[i])){
Term t = new Term();
t.setTerm(tokens[i]);
t.setTf(1); //获取单词t在terms中的索引,若不存在,索引值为-1
int index = terms.indexOf(t);
//若index<0,则说明term中不存在t,将t存入term列表
if (index == -1)
terms.add(t);
else {
//term已经存在,则相应地将其词频+1
Term tone = (terms.get(index));
tone.setTf(tone.getTf() + 1);
}
}
}
} //输出文本中term及其相应的Tf(出现的次数)
public void printDocument() {
for (int j = 0; j < terms.size(); j++) {
Term t = terms.get(j);
System.out.print(t.getTerm() + "(" + t.getTf() + ")" + "\t");
}
System.out.println();
} //设置并获取文本的term
public ArrayList<Term> getTerms() {
return terms;
} public void setTerms(ArrayList<Term> terms) {
this.terms = terms;
} }
这是我写的测试代码public class Test {
    static String text=null;
    public static void main(String[] args) throws Exception {
        copyFolder("E:\\chx3","E:\\chx4");
    }
    // Reads every file under oldPath, computes its term statistics, and writes
    // the report to a same-named file under newPath; recurses into sub-folders.
    public static void copyFolder(String oldPath, String newPath) {
        try {
            (new File(newPath)).mkdirs(); // create the target folder if absent
            File a=new File(oldPath);
            String[] file=a.list();
            long begin_time_count=System.currentTimeMillis();
            String line="";
            File temp=null;
            for (int i = 0; i < file.length; i++) {
                if(oldPath.endsWith(File.separator)){
                    temp=new File(oldPath+file[i]);
                }
                else{
                    temp=new File(oldPath+File.separator+file[i]);
                }
                if(temp.isFile()){
                    BufferedReader input = new BufferedReader(new FileReader(temp));
                    BufferedWriter output=new BufferedWriter(new FileWriter(newPath + "/" +
                            (temp.getName()).toString()));
                    // BUG: input.toString() is BufferedReader's default
                    // Object.toString() ("java.io.BufferedReader@<hash>"), so its
                    // length is a small constant (~25-30). Only that many lines
                    // are ever read from each file — this is why every .txt is
                    // processed only partially. Read with readLine() until null
                    // instead of sizing an array up front.
                    String[] string=new String[input.toString().length()];
                    for(int j=0;j<string.length;j++){
                        String s = input.readLine();
                        string[j] = s; // past EOF this stores null entries
                    }
                    Collection allDocs = new Collection(string);
                    allDocs.process();
                    allDocs.computeDF();
                    System.out.println("打印CollectionTermList:");
                    for (int m = 0; m < allDocs.termList.size(); m++) {
                        Term t = (Term) allDocs.termList.get(m);
                        output.write(t.getTerm() + " " +"TF:"+ t.getTf() + " " +"DF:"+ t.getDf()
                                +" "+"DDF:"+ t.getIdf() + " "+"TTF:"+t.getTtf()+" "+"TDF:"+t.getTdf()+" "+"ZTF:"+ t.getZTf()+" "+"ZDF:"+allDocs.documents.length);
                        output.newLine();
                        output.flush();
                    }
                    output.close();
                    input.close();
                }
                long end_time_count=System.currentTimeMillis();
                System.out.println("用时(毫秒):"+(end_time_count-begin_time_count));
                if(temp.isDirectory()){ // recurse into sub-folders
                    copyFolder(oldPath+"/"+file[i],newPath+"/"+file[i]);
                }
            }
        }
        catch (Exception e) {
            System.out.println("操作出错");
            e.printStackTrace();
        }
    }
}
Java 这个我不太懂,哥们,祝你好运!
我刚试了下,确实只能输出一部分,是不是跟循环有关?
BufferedReader input = new BufferedReader(new FileReader("E:\\1.txt"));
String[] string=new String[input.toString().length()];
for(int j=0;j<string.length;j++){
String s = input.readLine();
string[j] = s;
System.out.println(s);//输出结果不完整???
}
String str=null;
while((str=input.readLine())!=null){
System.out.println(str);
}
这样的 while 循环我知道,但上面的程序需要先把整个 txt 文件完整地放进数组,之后再从数组里读出来用。大哥知道该怎么改吗?
BufferedReader input = new BufferedReader(new FileReader("E:\\1.txt"));
String[] string=new String[input.toString().length()];
while ((line = input.readLine()) != null){
for(int j=0;j<string.length;j++){
String s = input.readLine();
string[j] = s;
System.out.println(s);
}
}
readLine() 每次只读一行:你在 while 的条件里读了一次,循环体里又读了一次,所以每两行就会丢掉一行;而且数组长度 input.toString().length() 本身也是错的。正确做法见下面:
String str=null;
List<String> list=new ArrayList<String>();
while((str=input.readLine())!=null){
list.add(str);
}
String[] array=list.toArray(new String[0]);
String[] string = new String[input.toString().length()];
int i=0;
while((s=input.read())!=-1){
Character c= new Character((char)s);
string[i] = c.toString();
i++;
}