package treeroot.util.wordcount;

/**
 * A single word together with the number of times it has been seen.
 * Instances are the elements of the Set returned by WordCount.
 * <strong>NOTE</strong>: words ignore case, so "hello", "Hello" and
 * "HELLO" are the same word.
 *
 * @author treeroot
 * @version 1.0, 04/12/06
 * @see WordCount
 */
public class Word{
    // Lower-cased text of the word; final because equals() and hashCode()
    // are derived from it.
    private final String value;
    // Occurrences recorded so far; a freshly built Word counts as one.
    private int count=1;

    /**
     * Constructs a Word object with a count of 1.
     *
     * @param value the text of the word; must not be null.
     * @throws NullPointerException if value is null.
     */
    public Word(String value){
        // A fixed locale keeps the case folding independent of the JVM
        // default (a Turkish default would turn "I" into a dotless i).
        this.value=value.toLowerCase(java.util.Locale.ENGLISH);
    }

    // This method is only invoked by the WordCount class.
    protected void increase(){
        count++;
    }

    /**
     * @return the word in lower case.
     */
    public String getWord(){
        return value;
    }

    /**
     * @return the number of appearances of this word.
     */
    public int getCount(){
        return count;
    }

    /**
     * @return true if o is a Word with the same text, ignoring case.
     */
    public boolean equals(Object o){
        return (o instanceof Word)&&(((Word)o).value.equals(value));
    }

    /**
     * @return the hash code of the lower-cased word.
     */
    public int hashCode(){
        return value.hashCode();
    }
}

package treeroot.util.wordcount;

import java.util.Map;
import java.util.Set;
import java.util.HashMap;
import java.util.TreeSet;
import java.util.Comparator;
import java.util.Collections;

/**
 * WordCount provides static methods to count the words of a text.
 * A word is a run of English letters (a-z, A-Z) in which single hyphen
 * characters may connect letters. The default hyphen characters are
 * '-', '_' and '''. A hyphen character is only part of a word when it
 * has a letter on both sides, so a-b, a_b and it's are words, while
 * -ab, _ab and 'as contribute only the word formed by their letters.
 * The result can be sorted by dictionary order or by frequency; if no
 * comparator is given, dictionary order is used.
 */
public class WordCount
{
    // The default hyphen collection, written as the inside of a regex
    // character class.
    private static final String DEFAULT_HYPHENS="\\-_'";

    /**
     * The DICTIONARY sort constant (the default): orders words by their
     * lower-case text.
     */
    public static final Comparator DICTIONARY_ORDER=new Comparator(){
        public int compare(Object o1,Object o2){
            Word w1=(Word)o1;
            Word w2=(Word)o2;
            return w1.getWord().compareTo(w2.getWord());
        }
    };

    /**
     * The FREQUENCY sort constant: orders words by descending count,
     * and words with equal counts by their lower-case text.
     */
    public static final Comparator FREQUENCY_ORDER=new Comparator(){
        public int compare(Object o1,Object o2){
            Word w1=(Word)o1;
            Word w2=(Word)o2;
            int c1=w1.getCount();
            int c2=w2.getCount();
            if(c1!=c2){
                // The higher count sorts first.
                return c1>c2?-1:1;
            }
            return w1.getWord().compareTo(w2.getWord());
        }
    };

    /**
     * Returns the words of the text using the default hyphens and the
     * default comparator.
     * @see #getWordCount(String,String,Comparator)
     */
    public static Set getWordCount(String text){
        return getWordCount(text,DEFAULT_HYPHENS,DICTIONARY_ORDER);
    }

    /**
     * Returns the words of the text using the default comparator.
     * @see #getWordCount(String,String,Comparator)
     */
    public static Set getWordCount(String text,String regex){
        return getWordCount(text,regex,DICTIONARY_ORDER);
    }

    /**
     * Returns the words of the text using the default hyphens.
     * @see #getWordCount(String,String,Comparator)
     */
    public static Set getWordCount(String text,Comparator order){
        return getWordCount(text,DEFAULT_HYPHENS,order);
    }

    /**
     * Returns the words of the text as a Set of Word objects; all words
     * are changed to lower case.
     *
     * @param text the English text you want to split.
     * @param regex the hyphen characters a word may use, written as the
     *        inside of a regex character class; null or empty means
     *        that words consist of letters only.
     * @param order the order of the returned Set; null means
     *        DICTIONARY_ORDER.
     * @return an unmodifiable sorted Set of the words the text contains.
     * @throws NullPointerException if text is null.
     * @throws java.util.regex.PatternSyntaxException if regex is not
     *         valid inside a character class.
     */
    public static Set getWordCount(String text,String regex,Comparator order){
        // Each Word is stored as both key and value so the counted
        // instance can be fetched back by an equal probe.
        Map counts=new HashMap();
        java.util.regex.Matcher matcher=wordPattern(regex).matcher(text);
        while(matcher.find()){
            Word candidate=new Word(matcher.group());
            Word seen=(Word)counts.get(candidate);
            if(seen==null){
                counts.put(candidate,candidate);
            }
            else{
                seen.increase();
            }
        }
        // Sorting happens only after counting is complete, so the
        // comparator sees the final counts.
        Set sorted=new TreeSet(order!=null?order:DICTIONARY_ORDER);
        sorted.addAll(counts.keySet());
        return Collections.unmodifiableSet(sorted);
    }

    // Builds the pattern for one word: letters, optionally joined by
    // single hyphen characters that have a letter on both sides.
    private static java.util.regex.Pattern wordPattern(String hyphens){
        String word="[a-zA-Z]+";
        if(hyphens!=null&&hyphens.length()>0){
            word=word+"(?:["+hyphens+"][a-zA-Z]+)*";
        }
        return java.util.regex.Pattern.compile(word);
    }
}
然后就定义token就是了
/**
 * A single word together with the number of times it has been seen.
 * Instances are the elements of the Set returned by WordCount.
 * <strong>NOTE</strong>: words ignore case, so "hello", "Hello" and
 * "HELLO" are the same word.
 *
 * @author treeroot
 * @version 1.0, 04/12/06
 * @see WordCount
 */
public class Word{
    // Lower-cased text of the word; final because equals() and hashCode()
    // are derived from it.
    private final String value;
    // Occurrences recorded so far; a freshly built Word counts as one.
    private int count=1;

    /**
     * Constructs a Word object with a count of 1.
     *
     * @param value the text of the word; must not be null.
     * @throws NullPointerException if value is null.
     */
    public Word(String value){
        // A fixed locale keeps the case folding independent of the JVM
        // default (a Turkish default would turn "I" into a dotless i,
        // so "IT" and "it" would no longer be the same word).
        this.value=value.toLowerCase(java.util.Locale.ENGLISH);
    }

    // This method is only invoked by the WordCount class.
    protected void increase(){
        count++;
    }

    /**
     * @return the word in lower case.
     */
    public String getWord(){
        return value;
    }

    /**
     * @return the number of appearances of this word.
     */
    public int getCount(){
        return count;
    }

    /**
     * @return true if o is a Word with the same text, ignoring case.
     */
    public boolean equals(Object o){
        return (o instanceof Word)&&(((Word)o).value.equals(value));
    }

    /**
     * @return the hash code of the lower-cased word.
     */
    public int hashCode(){
        return value.hashCode();
    }
}
package treeroot.util.wordcount;
import java.util.Map;
import java.util.Set;
import java.util.HashMap;
import java.util.TreeSet;
import java.util.Comparator;
import java.util.Collections;

/**
 * WordCount provides static methods to count the words of a text.
 * A word is a run of English letters (a-z, A-Z) in which single hyphen
 * characters may connect letters. The default hyphen characters are
 * '-', '_' and '''. A hyphen character is only part of a word when it
 * has a letter on both sides, so a-b, a_b and it's are words, while
 * -ab, _ab and 'as contribute only the word formed by their letters.
 * The result can be sorted by dictionary order or by frequency; if no
 * comparator is given, dictionary order is used.
 */
public class WordCount
{
    // The default hyphen collection, written as the inside of a regex
    // character class.
    private static final String DEFAULT_HYPHENS="\\-_'";

    /**
     * The DICTIONARY sort constant (the default): orders words by their
     * lower-case text.
     */
    public static final Comparator DICTIONARY_ORDER=new Comparator(){
        public int compare(Object o1,Object o2){
            Word w1=(Word)o1;
            Word w2=(Word)o2;
            return w1.getWord().compareTo(w2.getWord());
        }
    };

    /**
     * The FREQUENCY sort constant: orders words by descending count,
     * and words with equal counts by their lower-case text.
     */
    public static final Comparator FREQUENCY_ORDER=new Comparator(){
        public int compare(Object o1,Object o2){
            Word w1=(Word)o1;
            Word w2=(Word)o2;
            int c1=w1.getCount();
            int c2=w2.getCount();
            if(c1!=c2){
                // The higher count sorts first.
                return c1>c2?-1:1;
            }
            return w1.getWord().compareTo(w2.getWord());
        }
    };

    /**
     * Returns the words of the text using the default hyphens and the
     * default comparator.
     * @see #getWordCount(String,String,Comparator)
     */
    public static Set getWordCount(String text){
        return getWordCount(text,DEFAULT_HYPHENS,DICTIONARY_ORDER);
    }

    /**
     * Returns the words of the text using the default comparator.
     * @see #getWordCount(String,String,Comparator)
     */
    public static Set getWordCount(String text,String regex){
        return getWordCount(text,regex,DICTIONARY_ORDER);
    }

    /**
     * Returns the words of the text using the default hyphens.
     * @see #getWordCount(String,String,Comparator)
     */
    public static Set getWordCount(String text,Comparator order){
        return getWordCount(text,DEFAULT_HYPHENS,order);
    }

    /**
     * Returns the words of the text as a Set of Word objects; all words
     * are changed to lower case.
     *
     * @param text the English text you want to split.
     * @param regex the hyphen characters a word may use, written as the
     *        inside of a regex character class; null or empty means
     *        that words consist of letters only.
     * @param order the order of the returned Set; null means
     *        DICTIONARY_ORDER.
     * @return an unmodifiable sorted Set of the words the text contains.
     * @throws NullPointerException if text is null.
     * @throws java.util.regex.PatternSyntaxException if regex is not
     *         valid inside a character class.
     */
    public static Set getWordCount(String text,String regex,Comparator order){
        // Each Word is stored as both key and value so the counted
        // instance can be fetched back by an equal probe.
        Map counts=new HashMap();
        // Matching words directly (instead of splitting on separators)
        // never yields empty tokens and never treats a bare hyphen
        // character as a word.
        java.util.regex.Matcher matcher=wordPattern(regex).matcher(text);
        while(matcher.find()){
            Word candidate=new Word(matcher.group());
            Word seen=(Word)counts.get(candidate);
            if(seen==null){
                counts.put(candidate,candidate);
            }
            else{
                seen.increase();
            }
        }
        // Sorting happens only after counting is complete, so the
        // comparator sees the final counts.
        Set sorted=new TreeSet(order!=null?order:DICTIONARY_ORDER);
        sorted.addAll(counts.keySet());
        return Collections.unmodifiableSet(sorted);
    }

    // Builds the pattern for one word: letters, optionally joined by
    // single hyphen characters that have a letter on both sides.
    private static java.util.regex.Pattern wordPattern(String hyphens){
        String word="[a-zA-Z]+";
        if(hyphens!=null&&hyphens.length()>0){
            word=word+"(?:["+hyphens+"][a-zA-Z]+)*";
        }
        return java.util.regex.Pattern.compile(word);
    }
}