100分求讨论多线程文本文件单词统计

现在有一个文本文件叫做index.txt，里面有很多行，每一行都是一个文件名，每个文件名所指向的文件里都包含一些英文的段落（以空格隔开的单词）。现在想在读每一行的时候创建一个线程，该线程统计每个英文单词出现的次数，最后汇总所有单词的出现次数并按字母顺序打印，例如：
apple 3
beach 1
me 5
zero 5

本人已经完成了单词统计的部分（尚未完成重复词计数），突然觉得自己的想法可能满足不了实现这个程序的要求，所以散100分和高手们讨论求点思路。如果能有高手给几段代码是最好，但是也十分欢迎一起讨论你的想法。完成部分如下：
ZhengProject4.java
import edu.truman.cs260.Zheng.RunIndex;
import edu.truman.cs260.Zheng.Counter;
import edu.truman.cs260.Zheng.Word;import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
//import java.util.ArrayList;
import java.util.ArrayList;/**
* @author Tian
*
*/
public class ZhengProject4 { /**
* @param args
*/
public static void main(String[] args) {
Runnable counter = new RunIndex(null);
//ArrayList<String> names = new ArrayList<String>();
File file = new File("index.txt");
BufferedReader reader = null;

try {
reader = new BufferedReader(new FileReader(file));
String text = null; while ((text = reader.readLine()) != null)
{
//System.out.println(text);
counter = new RunIndex(text);
Thread t1 = new Thread(counter);
t1.start();
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (reader != null) reader.close();
} catch (IOException e) {
e.printStackTrace();
}
//ArrayList <Counter> counters;
//Counter counters = new Counter(null);
//for (Counter counter : counters) System.out.println (counters.getCounter()+ ": "+counters.getEvent());
}
}
}
RunIndex.java
/**
*
*/
package edu.truman.cs260.Zheng;import java.lang.Runnable;
import java.io.File;
import java.util.ArrayList;/**
* @author Tian
*
*/
public class RunIndex implements Runnable {

    /** Name of the text file this task reads. */
    private final String indexName;

    /**
     * Creates a task for one text file.
     *
     * @param aIndexName name of the text file to process
     */
    public RunIndex(String aIndexName) {
        indexName = aIndexName;
    }

    /**
     * Reads the file through {@link TextReader#readIn()} and prints every
     * returned word with its count as {@code word: count}.
     *
     * <p>Does nothing when no file name was given or when the reader returns
     * {@code null}, which it does after a read failure.
     */
    public void run() {
        if (indexName == null) {
            return;
        }
        TextReader parser = new TextReader(new File(indexName));
        ArrayList<Word> words = parser.readIn();
        if (words == null) {
            // readIn() has already reported the failure; nothing to print.
            return;
        }
        for (Word word : words) {
            System.out.println(word.getWord() + ": " + word.getEvent());
        }
    }
}
TextReader.java:
/**
*
*/
package edu.truman.cs260.Zheng;import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.StringTokenizer; /**
* @author Tian
*
*/
public class TextReader {

    /** The text file to read. */
    private File file;

    /**
     * Creates a reader for one text file.
     *
     * @param aFile the file to read; the existence check is an assertion and
     *              therefore only runs when assertions are enabled
     */
    public TextReader(File aFile) {
        assert aFile.exists() && aFile.isFile();
        file = aFile;
    }

    /**
     * Reads the file line by line, splits every line on whitespace and counts
     * how often each token occurs.
     *
     * @return one {@link Word} per distinct token, sorted by the natural
     *         ordering of {@code Word}; an empty list (never {@code null})
     *         when the file cannot be read
     */
    public ArrayList<Word> readIn() {
        ArrayList<Word> words = new ArrayList<Word>();
        BufferedReader input = null;
        try {
            input = new BufferedReader(new FileReader(file));
            String line;
            while ((line = input.readLine()) != null) {
                StringTokenizer tokenizer = new StringTokenizer(line);
                while (tokenizer.hasMoreTokens()) {
                    tally(words, tokenizer.nextToken());
                }
            }
            Collections.sort(words);
            return words;
        } catch (IOException e) {
            e.printStackTrace();
            // Partial counts would be misleading, so report nothing.
            return new ArrayList<Word>();
        } finally {
            if (input != null) {
                try {
                    input.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Increases the count of the entry whose text equals {@code token}, or
     * appends a new entry with count 1 when there is none yet.
     *
     * <p>Entries are matched on their text, so the lookup does not depend on
     * how {@code Word} implements {@code equals}.
     */
    private static void tally(ArrayList<Word> words, String token) {
        for (Word seen : words) {
            if (seen.getWord().equals(token)) {
                seen.increase();
                return;
            }
        }
        Word fresh = new Word(token);
        fresh.increase();
        words.add(fresh);
    }
}
Word.java:
/**
*
*/
package edu.truman.cs260.Zheng;/**
* @author Tian
*
*/
public class Word implements Comparable <Word> {

private String word;
private int event;

public Word(String aWord) {
assert aWord != null;
word = aWord;
}

public String getWord() {
return word;
}

public int getEvent() {
return event;
}

public void increase(){
event++;
}

public boolean equals(Word another){
if (another == null)
return false;
if (another instanceof Word){
Word anotherWord = (Word)another;
return anotherWord.getWord().equals(word);
} else
return false;
}

public int compareTo (Word aWord) {
if (event < aWord.getEvent())
return -1;
else if (event == aWord.getEvent())
return 0;
else
return 1;
}}
现在主要面临的困境有两个，一是在主类中我不同线程的创建是通过一个while循环完成的，所以我无法对它们具体控制。如果想让它们对一个全局计数变量或对象操作的话，是需要给线程加锁之类的吧？实在是初学，没有十分理解那部分，希望能有人指导一下怎么改写成带锁的多线程。二是每读一个单词，我会对Word类的一个对象进行操作，增加计数什么的，但是不同的线程是相互独立的，怎么才能让这个Word被公用呢？直接用static变量不太符合面向对象的思路，所以我不是很想用，有没有解决办法？或者有没有高手觉得根据我的要求，我程序的结构上有什么问题？欢迎大家指出！一百分求讨论。十分感谢！

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

了解下Map/Reduce思想吧，其实跟你这个很有相似性。总的来说，从速度考虑，不需要所有线程都操作同一个全局计数变量，这样加锁会严重限制并发能力。建议：
1、线程数量不要太多，你的CPU没有那么多核，磁盘IO阻塞也没有达到那么严重能让你复用CPU；除非你打算用分布式，也就是多台电脑协同完成；
2、每个线程一个自己的全局计数变量，这样就不需要加锁了，性能高；
3、线程处理完毕自己所负责的文件后，就把自己的计数器返回给主线程（负责分配任务的），主线程将其计数值合并到主线程所维护的总计数器中；
4、給空闲出来的线程分配下一行（新的待处理文件）。
用一个共享的map来做统计，每个线程操作这个map时，对map锁一下就可以了。不过你每一行创建一个线程，如果有10万行就是10万个线程，很消耗系统资源；可以把每一行放到一个队列里，或者让多个线程一起读index文件就好了。
//每一行创建一个线程的
public class RunIndex implements Runnable {

    /** Name of the text file this task reads. */
    private final String indexName;

    /** Result map shared by all tasks; every access is guarded by its monitor. */
    private final Map<Word, Integer> map;

    /**
     * Creates a task for one text file.
     *
     * @param aIndexName name of the text file to process
     * @param map        shared map that receives the counts
     */
    public RunIndex(String aIndexName, Map<Word, Integer> map) {
        indexName = aIndexName;
        this.map = map;
    }

    /**
     * Reads the file through {@code TextReader.readIn()} and adds every
     * returned word's own count to the shared map.
     *
     * <p>NOTE(review): entries from different files are merged only if
     * {@code Word} overrides {@code equals(Object)} and {@code hashCode()};
     * verify this on the {@code Word} class.
     */
    public void run() {
        TextReader parser = new TextReader(new File(indexName));
        ArrayList<Word> words = parser.readIn();
        if (words == null) {
            // readIn() returns null after a read failure.
            return;
        }
        // The file is read outside the lock; only the merge is serialised.
        synchronized (map) {
            for (Word word : words) {
                Integer soFar = map.get(word);
                int total = (soFar == null ? 0 : soFar.intValue()) + word.getEvent();
                map.put(word, total);
            }
        }
    }
}

public class ZhengProject4 {

    /**
     * Starts one {@link RunIndex} thread per non-blank line of
     * {@code index.txt}, waits for all of them and prints the shared map.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        File file = new File("index.txt");
        BufferedReader reader = null;
        Map<Word, Integer> map = new HashMap<Word, Integer>(); // shared result
        List<Thread> list = new ArrayList<Thread>();
        try {
            reader = new BufferedReader(new FileReader(file));
            String text;
            while ((text = reader.readLine()) != null) {
                // A blank line would become an empty file name.
                if (text.trim().length() == 0) {
                    continue;
                }
                Thread worker = new Thread(new RunIndex(text, map)); // every thread shares the map
                list.add(worker);
                worker.start();
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        // Block until every worker has finished instead of polling isAlive().
        for (Thread worker : list) {
            try {
                worker.join();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
        }

        // Print the result, one entry per line.
        for (Map.Entry<Word, Integer> e : map.entrySet()) {
            System.out.printf("%s, %s%n", e.getKey().getWord(), e.getValue());
        }
    }
}
谢谢三楼！还没有来得及在您的例子上调试，我这会儿是个学习用的小程序，目前不准备考虑效率，主要就是为了理解这个概念，但是现在卡住了，就理解不了了。我自己的index.txt是四行的，每个文本文件里就一句话，应该不会出现那种很多很多的极端情况。
我估计是你想复杂了，先重述下你的数据结构：
◎ 你有一个主文件 index.txt，里面列出了很多需要统计单词数量的子文件，相当于一个文件列表；
◎ 需要进行统计的子文件，内容都是英文的段落（以空格隔开的单词）。不清楚你单个子文件的大小，但我相信如果只是几十兆这个规模的话，是肯定不需要针对单个子文件使用多线程的。
那么方案其实挺简单，可以这么说，主体基本上就跟3楼阿宝的模型是类似的，但有两种不同：
1、我建议不要每行（每个子文件）启动一个线程，因为如果你index.txt行数太多的话，比如数万行，你这个代价太高了，资源都浪费到线程切换去了；如果你index.txt行数不多的话，比如才几十行，那无所谓；也就是这里建议引入线程池的概念而已了，限制运行线程的规模；
2、我建议线程各自独立计数器，处理完毕后再将线程的计数器合并到主计数器中；你可以选择在每个子文件处理完毕时进行合并，也可以选择在所有子文件全部处理完毕时再一次性合并；好处是，以后你如果想增加这种：暂停、恢复、容错等功能，就比较容易了。总的来说，如果index.txt规模不大，3楼阿宝的模型就能够非常好的完成任务了。
hadoop中mapreduce的demo就是关于文本文件单词统计的，楼主可以参考下。
我对3楼的框架进行了应用，程序做了点小修改，结果输出结果成了这样：edu.truman.cs260.Zheng.Word@721cdeff 1
edu.truman.cs260.Zheng.Word@e76cbf7 1
edu.truman.cs260.Zheng.Word@17dfafd1 1
edu.truman.cs260.Zheng.Word@272d7a10 1
edu.truman.cs260.Zheng.Word@1aa8c488 1
edu.truman.cs260.Zheng.Word@2352544e 1
edu.truman.cs260.Zheng.Word@7ecec0c5 1
edu.truman.cs260.Zheng.Word@22998b08 1
edu.truman.cs260.Zheng.Word@457471e0 1
edu.truman.cs260.Zheng.Word@7a6d084b 1
edu.truman.cs260.Zheng.Word@5e8fce95 1
edu.truman.cs260.Zheng.Word@3dfeca64 1
edu.truman.cs260.Zheng.Word@1948cc8c 1
edu.truman.cs260.Zheng.Word@c3bb2b8 1
edu.truman.cs260.Zheng.Word@3343c8b3 1
edu.truman.cs260.Zheng.Word@5fe04cbf 1

原本应该前面是单词后面是计数，结果前面的单词都成了这种类似地址的东西。请问一下这个是map的问题还是StringTokenizer的问题？谢谢！
已经解决了，不是图的原因。在这个模型下我有种多余的对象叫Word，这些乱七八糟的输出的正是那些个对象。我把那东西删了全用String以后就全好了。谢谢！
十分感谢楼上诸君的参与与帮助，我刚刚完成了这个程序，准备结贴。为了防止以后有人会搜索到这里来能有所借鉴，我将我的源码在此附上。谢谢！
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;import edu.truman.cs260.Zheng.RunIndex;
/**
* @author Tian
* CS260 Final Project
* Project 4 - Multithreaded Programming
* This is a program that makes a list of all the words in a collection of
* text files store in "index.txt", and the frequencies with which how many
* time one word appears. This program uses one separate thread for each file
* to count words in each file and total them into a map structure.
*/
public class ZhengProject4 { /**
* Main method
*/
public static void main(String[] args) {
Runnable counter = new RunIndex(null, null);
        File file = new File("index.txt");
        BufferedReader reader = null;
        Map<String, Integer> map = new TreeMap<String, Integer>();
        List<Thread> list = new ArrayList<Thread>();

        // Read index.txt and for each line, create a thread
        try {
            reader = new BufferedReader(new FileReader(file));
            String text = null;            while ((text = reader.readLine()) != null)
            {
                counter = new RunIndex(text, map);
                Thread t1 = new Thread(counter);
                list.add(t1);
                t1.start();
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (reader != null) reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }        // Check if all the threads are finished
        while (true) {
            boolean end = true;
            for (Thread t : list) {
                if (t.isAlive()) {
                    end = false;
                    break;
                }
            }
            if (end) break;
            Thread.yield();
        }        // Output result
        for (Map.Entry<String, Integer> e : map.entrySet()) {
            System.out.print(e.getKey());
            System.out.print(" ");
            System.out.println(e.getValue());
        }
}
}
package edu.truman.cs260.Zheng;import java.io.File;
import java.util.ArrayList;
import java.util.Map;/**
* @author Tian
* This is the class that implements Runnable interface.
* It executes every time a thread is created.
*/
public class RunIndex implements Runnable {

    /** Name of the text file this task reads. */
    private final String indexName;

    /** Result map shared by all tasks; every access is guarded by its monitor. */
    private final Map<String, Integer> map;

    /**
     * Constructor of RunIndex object.
     *
     * @param aIndexName name of the text file that needs to be processed
     * @param map        the shared map that collects the result
     */
    public RunIndex(String aIndexName, Map<String, Integer> map) {
        indexName = aIndexName;
        this.map = map;
    }

    /**
     * The thread's task: reads the file through {@code TextReader.readIn()}
     * and adds one to the shared count of every word returned.
     *
     * <p>Does nothing when the reader returns {@code null}, which it does
     * after a read failure.
     */
    public void run() {
        TextReader parser = new TextReader(new File(indexName));
        ArrayList<String> words = parser.readIn();
        if (words == null) {
            return;
        }
        // The file is read outside the lock; only the merge is serialised.
        synchronized (map) {
            for (String word : words) {
                Integer soFar = map.get(word);
                map.put(word, soFar == null ? 1 : soFar + 1);
            }
        }
    }
}package edu.truman.cs260.Zheng;import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.StringTokenizer; /**
* @author Tian
* This is a class of text reader. It reads a text file and
* separate words in that file.
*/
public class TextReader {

    /** The text file to read. */
    private File file;

    /**
     * Constructor of TextReader object.
     *
     * @param aFile the file that needs to be processed
     */
    public TextReader(File aFile) {
        file = aFile;
    }

    /**
     * Reads the text file and splits every line into whitespace-separated
     * words.
     *
     * @return the words in the order they appear, repeats included; an empty
     *         list (never {@code null}) when the file cannot be read
     */
    public ArrayList<String> readIn() {
        ArrayList<String> words = new ArrayList<String>();
        BufferedReader input = null;
        try {
            input = new BufferedReader(new FileReader(file));
            String line;
            while ((line = input.readLine()) != null) {
                StringTokenizer tokenizer = new StringTokenizer(line);
                while (tokenizer.hasMoreTokens()) {
                    words.add(tokenizer.nextToken());
                }
            }
            return words;
        } catch (IOException e) {
            e.printStackTrace();
            // Partial results would be misleading, so report nothing.
            return new ArrayList<String>();
        } finally {
            if (input != null) {
                try {
                    input.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }
}