java 过滤敏感词替换 求java过滤敏感词替换** 如:TMD 缺德 你TMD,也太缺德了,太变态了吧TM 替换成:你***,也太缺德了,太**了吧**注意:一个字就是一个*有代码的话发我新浪邮箱:[email protected] 谢谢! 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 嗯嗯 ,,写个过滤器 ,,replaceAll()处理之~~ 很简单,用正则表达式即可判断,代码如下 邮箱就不发了public static void main(String[] args) { String str = "你TMD,也太缺德了,太变态了吧TM "; String regex = ".*[TMD,TM].*"; Pattern pat = Pattern.compile(regex); Matcher mat = pat.matcher(str); String s = ""; if (mat.matches()) { s = mat.group().replace("TMD", "*").replace("TM", "*"); } System.out.println(s); } 字符串匹配。实现并不难只是看你是否要求高效率了,看样子你这个应该也是聊天服务器中处理敏感词。参考下这个算法,你自己结合来作出自己程序:http://blog.csdn.net/jiajianhui2009/article/details/6229740 public class WmParser { private static Log log = LogFactory.getLog(WmParser.class); public static WmParser wmParser; private static String CHARSET = "ISO-8859-1"; static { try { log.debug("Instantiating WmParser...."); wmParser = new WmParser(); InputStream in = WmParser.class.getResourceAsStream("bad_words_zh_CN.txt"); BufferedReader reader = new BufferedReader( new InputStreamReader(in)); String line = null; while ((line = reader.readLine()) != null) {// wmParser.addFilterKeyWord(new String(line.getBytes(), "ISO-8859-1"), 1); wmParser.addFilterKeyWord(line, 1);// String[] badWords = line.split("!");// if (badWords.length == 0){// continue;// }// if (badWords[0].equals("")){// continue;// }// try {// wmParser.addFilterKeyWord(badWords[0], Integer.valueOf(badWords[1]));// } catch (NumberFormatException e) {// log.error("NumberFormatException in Instantiating WmParser's badWords level:" + e);// wmParser.addFilterKeyWord(badWords[0], Integer.valueOf(1));// } } reader.close(); in.close(); } catch (Exception e) { log.error("Exception in Instantiating WmParser:" + e); e.printStackTrace(); } } protected WmParser(){ } public static WmParser getInstance(){ return wmParser; } private boolean initFlag = false; private UnionPatternSet unionPatternSet = new UnionPatternSet(); private int maxIndex = (int) java.lang.Math.pow(2, 16); 
private int shiftTable[] = new int[maxIndex]; public Vector<AtomicPattern> hashTable[] = new Vector[maxIndex]; private UnionPatternSet tmpUnionPatternSet = new UnionPatternSet(); public static void main(String args[]) { try { WmParser filterEngine = WmParser.getInstance(); Vector<Integer> levelSet = new Vector<Integer>(); String str = "单个的政治,政治运动和强奸和shit"; SimpleDateFormat sf = new SimpleDateFormat("HH:mm:ss.SSS"); System.out.println("文本长度:" + str.length()); System.out.println("敏感词汇总数:" + filterEngine.tmpUnionPatternSet.getSet().size()); Date start = new Date(System.currentTimeMillis()); System.out.println("过滤开始:" + sf.format(start)); System.out.println(str); System.out.println(filterEngine.parse(new String(str.getBytes(), "ISO_8859-1"), levelSet)); Date end = new Date(System.currentTimeMillis()); System.out.println("过滤完毕:" + sf.format(end)); System.out.println("文本中出现敏感词个数:" + levelSet.size()); System.out.println("耗时:" + (end.getTime() - start.getTime()) + "ms"); } catch (Exception e) { e.printStackTrace(); } } public boolean addFilterKeyWord(String keyWord, int level) { if (initFlag == true) return false; UnionPattern unionPattern = new UnionPattern(); Pattern pattern = new Pattern(keyWord); AtomicPattern atomicPattern = new AtomicPattern(pattern); unionPattern.addNewAtomicPattrn(atomicPattern); unionPattern.setLevel(level); atomicPattern.setBelongUnionPattern(unionPattern); tmpUnionPatternSet.addNewUnionPattrn(unionPattern); return true; } public String parse(String content, Vector<Integer> levelSet){ try { if (initFlag == false) init(); Vector<AtomicPattern> aps = new Vector<AtomicPattern>(); StringBuilder sb = new StringBuilder(); char checkChar; for (int i = 0; i < content.length();) { checkChar = content.charAt(i); if (shiftTable[checkChar] == 0) { Vector<AtomicPattern> tmpAps = new Vector<AtomicPattern>(); Vector<AtomicPattern> destAps = hashTable[checkChar]; int match = 0; for (int j = 0; j < destAps.size(); j++) { AtomicPattern ap = destAps.get(j); if 
(ap.findMatchInString(content.substring(0, i + 1))){ String patternStr = ap.getPattern().str; if (match > 0){ sb.setLength(sb.length() - patternStr.length()); } else { sb.setLength(sb.length() - patternStr.length() + 1); } appendStar(patternStr, sb); tmpAps.add(ap); match++; } } aps.addAll(tmpAps); if (tmpAps.size() <= 0){ sb.append(checkChar); } i++; } else { if (i + shiftTable[checkChar] <= content.length()){ sb.append(content.substring(i, i + shiftTable[checkChar])); } else { sb.append(content.substring(i)); } i = i + shiftTable[checkChar]; } } parseAtomicPatternSet(aps, levelSet); return sb.toString(); } catch (Exception e) { log.error(e); e.printStackTrace(); } return ""; } private static void appendStar(String patternStr, StringBuilder sb){ for (int c = 0;c < patternStr.length(); c++){ char ch = patternStr.charAt(c); if ((ch >= 0x4e00 && ch <= 0x9FA5) || (ch >= 0xF900 && ch <= 0xFA2D)){ sb.append("*"); } else { sb.append("*"); } } } private void parseAtomicPatternSet(Vector<AtomicPattern> aps, Vector<Integer> levelSet) { while (aps.size() > 0) { AtomicPattern ap = aps.get(0); UnionPattern up = ap.belongUnionPattern; if (up.isIncludeAllAp(aps)) { levelSet.add(new Integer(up.getLevel())); } aps.remove(0); } } // shift table and hash table of initialize private void init() { initFlag = true; for (int i = 0; i < maxIndex; i++) hashTable[i] = new Vector<AtomicPattern>(); shiftTableInit(); hashTableInit(); } public void clear() { tmpUnionPatternSet.clear(); initFlag = false; } private void shiftTableInit() { for (int i = 0; i < maxIndex; i++) shiftTable[i] = 2; Vector<UnionPattern> upSet = tmpUnionPatternSet.getSet(); for (int i = 0; i < upSet.size(); i++) { Vector<AtomicPattern> apSet = upSet.get(i).getSet(); for (int j = 0; j < apSet.size(); j++) { AtomicPattern ap = apSet.get(j); Pattern pattern = ap.getPattern(); if (shiftTable[pattern.charAtEnd(1)] != 0) shiftTable[pattern.charAtEnd(1)] = 1; if (shiftTable[pattern.charAtEnd(0)] != 0) 
shiftTable[pattern.charAtEnd(0)] = 0; } } } private void hashTableInit() { Vector<UnionPattern> upSet = tmpUnionPatternSet.getSet(); for (int i = 0; i < upSet.size(); i++) { Vector<AtomicPattern> apSet = upSet.get(i).getSet(); for (int j = 0; j < apSet.size(); j++) { AtomicPattern ap = apSet.get(j); Pattern pattern = ap.getPattern(); if (pattern.charAtEnd(0) != 0) { hashTable[pattern.charAtEnd(0)].add(ap); } } } }}class Pattern { // string Pattern(String str) { this.str = str; } public char charAtEnd(int index) { if (str.length() > index) { return str.charAt(str.length() - index - 1); } else return 0; } public String str; public String getStr() { return str; };}class AtomicPattern {// public boolean findMatchInString(String str) throws Exception {// String patStr = new String(this.pattern.str.getBytes("ISO-8859-1"), "UTF-8");// str = new String(str.getBytes("ISO-8859-1"), "UTF-8");// if (patStr.length() > str.length())// return false;// int beginIndex = str.lastIndexOf(patStr.charAt(0) + "");// if (beginIndex != -1){// String eqaulLengthStr = str.substring(beginIndex);// if (patStr.equalsIgnoreCase(eqaulLengthStr))// return true;// }// return false;// } public boolean findMatchInString(String str) { if (this.pattern.str.length() > str.length()) return false; int beginIndex = str.lastIndexOf(this.pattern.str.charAt(0) + ""); if (beginIndex != -1){ String eqaulLengthStr = str.substring(beginIndex); if (this.pattern.str.equalsIgnoreCase(preConvert(eqaulLengthStr))) return true; } return false; } private String preConvert(String content) { String retStr = new String(); for (int i = 0; i < content.length(); i++) { char ch = content.charAt(i); if (this.isValidChar(ch)) { retStr = retStr + ch; } } return retStr; } private boolean isValidChar(char ch) { if ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) return true; if ((ch >= 0x4e00 && ch <= 0x9FA5) || (ch >= 0xF900 && ch <= 0xFA2D)) return true; return false; } AtomicPattern(Pattern 
pattern) { this.pattern = pattern; }; private Pattern pattern; public UnionPattern belongUnionPattern; public UnionPattern getBelongUnionPattern() { return belongUnionPattern; } public void setBelongUnionPattern(UnionPattern belongUnionPattern) { this.belongUnionPattern = belongUnionPattern; } public Pattern getPattern() { return pattern; } public void setPattern(Pattern pattern) { this.pattern = pattern; }}class SameAtomicPatternSet { SameAtomicPatternSet() { SAPS = new Vector<AtomicPattern>(); }; public Vector<AtomicPattern> SAPS;}class UnionPattern { // union string UnionPattern() { this.apSet = new Vector<AtomicPattern>(); } public Vector<AtomicPattern> apSet; public void addNewAtomicPattrn(AtomicPattern ap) { this.apSet.add(ap); } public Vector<AtomicPattern> getSet() { return apSet; } public boolean isIncludeAllAp(Vector<AtomicPattern> inAps) { if (apSet.size() > inAps.size()) return false; for (int i = 0; i < apSet.size(); i++) { AtomicPattern ap = apSet.get(i); if (isInAps(ap, inAps) == false) return false; } return true; } private boolean isInAps(AtomicPattern ap, Vector<AtomicPattern> inAps) { for (int i = 0; i < inAps.size(); i++) { AtomicPattern destAp = inAps.get(i); if (ap.getPattern().str.equalsIgnoreCase(destAp.getPattern().str)) return true; } return false; } public void setLevel(int level) { this.level = level; } public int getLevel() { return this.level; } private int level;}class UnionPatternSet { // union string set UnionPatternSet() { this.unionPatternSet = new Vector<UnionPattern>(); } public void addNewUnionPattrn(UnionPattern up) { this.unionPatternSet.add(up); } public Vector<UnionPattern> unionPatternSet; public Vector<UnionPattern> getSet() { return unionPatternSet; } public void clear() { unionPatternSet.clear(); }}附上我之前改写的一个吧。 jacod问题 hibernate。重启服务器时会使原来数据库数据丢失,怎么办 正则表达式问题 log4j的tomcat控制台为什么打不出来?? 一个webservice应用采用axis,如何返回List 菜鸟问题,等大侠来拿分!!! ====有奖竟猜====灰熊Vs火箭=== 截止时间:4月12日-10点59分 请看看为什么CachedRowSet更新时提示acceptChanges 失败! 
关于weblogic数据库连接池的问题 servletinputstream怎末转都是中文乱码 各位大大推荐个开源的建站系统 什么导致了c3p0死锁.
/**
 * Demo: prints the sample sentence with every sensitive word masked by one
 * '*' per character (so "TMD" becomes "***" and "TM" becomes "**").
 *
 * Text without any sensitive word is printed unchanged.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    String str = "你TMD,也太缺德了,太变态了吧TM ";
    // Longest word first: replacing "TM" before "TMD" would leave a stray "D".
    String[] badWords = {"TMD", "TM"};
    String s = str;
    for (int w = 0; w < badWords.length; w++) {
        String word = badWords[w];
        // Build a mask with exactly as many stars as the word has characters.
        StringBuilder mask = new StringBuilder(word.length());
        for (int k = 0; k < word.length(); k++) {
            mask.append('*');
        }
        // Literal (non-regex) replacement, so no character needs escaping.
        s = s.replace(word, mask.toString());
    }
    System.out.println(s);
}
public class WmParser {
/** Logger of this class. */
private static Log log = LogFactory.getLog(WmParser.class);
/** The shared parser; assigned by the static initializer below. */
public static WmParser wmParser;
// NOTE(review): not referenced anywhere in the visible code - confirm before removing.
private static String CHARSET = "ISO-8859-1";

/*
 * Creates the shared parser and registers every line of the class-path
 * resource "bad_words_zh_CN.txt" as a keyword of level 1.
 * Failures are logged and leave the parser with the keywords read so far.
 */
static {
    BufferedReader reader = null;
    try {
        log.debug("Instantiating WmParser....");
        wmParser = new WmParser();
        InputStream in = WmParser.class.getResourceAsStream("bad_words_zh_CN.txt");
        if (in == null) {
            // getResourceAsStream signals a missing resource with null, not an exception.
            log.error("Exception in Instantiating WmParser: resource bad_words_zh_CN.txt not found");
        } else {
            // NOTE(review): decodes with the platform default charset - confirm it
            // matches the encoding of the word list.
            reader = new BufferedReader(new InputStreamReader(in));
            String line = null;
            while ((line = reader.readLine()) != null) {
                wmParser.addFilterKeyWord(line, 1);
            }
        }
    } catch (Exception e) {
        // Pass the throwable so the logger records the stack trace.
        log.error("Exception in Instantiating WmParser:" + e, e);
    } finally {
        // Closing the reader also closes the wrapped resource stream.
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception closeFailure) {
                log.error("Exception in Instantiating WmParser:" + closeFailure, closeFailure);
            }
        }
    }
}
/**
 * Creates a parser without any keyword. Protected, so it is reachable only
 * from subclasses and from the same package.
 */
protected WmParser() {
    // Nothing to do: the instance fields carry their own initializers.
}
/**
 * Returns the shared parser stored in the static {@code wmParser} field.
 *
 * @return the value of {@code wmParser}
 */
public static WmParser getInstance() {
    return WmParser.wmParser;
}
/** True once the lookup tables are built; addFilterKeyWord refuses new keywords while it is set. */
private boolean initFlag = false;
// NOTE(review): never read in the visible code (keywords live in tmpUnionPatternSet) - confirm before removing.
private UnionPatternSet unionPatternSet = new UnionPatternSet();
/** Size of both lookup tables: one slot per possible char value (65536). */
private int maxIndex = Character.MAX_VALUE + 1;
/** Per-character skip distance consulted by parse(); 0 marks the last character of a keyword. */
private int shiftTable[] = new int[maxIndex];
/** Per-character list of the keywords that end with that character. */
public Vector<AtomicPattern> hashTable[] = new Vector[maxIndex];
/** Keywords registered through addFilterKeyWord. */
private UnionPatternSet tmpUnionPatternSet = new UnionPatternSet();

/**
 * Demo: filters a sample sentence with the shared parser and prints the
 * masked text, the number of hits and the elapsed time.
 *
 * @param args ignored
 */
public static void main(String args[]) {
    try {
        WmParser filterEngine = WmParser.getInstance();
        Vector<Integer> levelSet = new Vector<Integer>();
        String str = "单个的政治,政治运动和强奸和shit";
        SimpleDateFormat sf = new SimpleDateFormat("HH:mm:ss.SSS");
        System.out.println("文本长度:" + str.length());
        System.out.println("敏感词汇总数:" + filterEngine.tmpUnionPatternSet.getSet().size());
        Date start = new Date(System.currentTimeMillis());
        System.out.println("过滤开始:" + sf.format(start));
        System.out.println(str);
        // Pass the text as-is. Round-tripping it through getBytes()/ISO-8859-1
        // turns each non-ASCII character into several Latin-1 characters, which
        // can never equal the keywords (registered as plain Strings) and would
        // garble the printed result.
        System.out.println(filterEngine.parse(str, levelSet));
        Date end = new Date(System.currentTimeMillis());
        System.out.println("过滤完毕:" + sf.format(end));
        System.out.println("文本中出现敏感词个数:" + levelSet.size());
        System.out.println("耗时:" + (end.getTime() - start.getTime()) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Registers a sensitive keyword.
 *
 * @param keyWord the keyword text
 * @param level   the level reported in parse()'s levelSet when the keyword is found
 * @return true when the keyword was registered; false when the lookup tables
 *         are already built (initFlag set) or keyWord is null or empty
 */
public boolean addFilterKeyWord(String keyWord, int level) {
    if (initFlag) {
        return false;
    }
    // A null keyword would break table construction and an empty one can never match.
    if (keyWord == null || keyWord.length() == 0) {
        return false;
    }
    Pattern pattern = new Pattern(keyWord);
    AtomicPattern atomicPattern = new AtomicPattern(pattern);
    UnionPattern unionPattern = new UnionPattern();
    unionPattern.addNewAtomicPattrn(atomicPattern);
    unionPattern.setLevel(level);
    atomicPattern.setBelongUnionPattern(unionPattern);
    tmpUnionPatternSet.addNewUnionPattrn(unionPattern);
    return true;
}

/**
 * Masks every registered keyword found in content, one '*' per character of
 * the matched span (including any characters the matcher skipped inside it).
 *
 * @param content  the text to filter
 * @param levelSet receives the levels collected by parseAtomicPatternSet for the hits
 * @return the masked text, or "" when an exception occurred (it is logged)
 */
public String parse(String content, Vector<Integer> levelSet){
    try {
        if (!initFlag) {
            init();
        }
        Vector<AtomicPattern> aps = new Vector<AtomicPattern>();
        StringBuilder sb = new StringBuilder(content.length());
        int i = 0;
        // Invariant: at the top of the loop sb holds exactly i characters.
        while (i < content.length()) {
            char checkChar = content.charAt(i);
            int shift = shiftTable[checkChar];
            if (shift == 0) {
                // checkChar ends at least one keyword: verify the candidates.
                Vector<AtomicPattern> destAps = hashTable[checkChar];
                String prefix = content.substring(0, i + 1);
                boolean matched = false;
                for (int j = 0; j < destAps.size(); j++) {
                    AtomicPattern ap = destAps.get(j);
                    if (ap.findMatchInString(prefix)) {
                        // Mask the span really matched in the text; it is longer
                        // than the keyword when separators were skipped.
                        int begin = matchStart(content, i, ap.getPattern().str);
                        sb.setLength(begin);
                        appendStar(content.substring(begin, i + 1), sb);
                        aps.add(ap);
                        matched = true;
                    }
                }
                if (!matched) {
                    sb.append(checkChar);
                }
                i++;
            } else {
                // Never jump over a character that ends a keyword: the match may
                // span skipped separators, which the shift table does not model.
                if (shift > 1 && i + 1 < content.length()
                        && shiftTable[content.charAt(i + 1)] == 0) {
                    shift = 1;
                }
                int end = Math.min(i + shift, content.length());
                sb.append(content.substring(i, end));
                i = i + shift;
            }
        }
        parseAtomicPatternSet(aps, levelSet);
        return sb.toString();
    } catch (Exception e) {
        log.error(e);
        e.printStackTrace();
    }
    return "";
}

/**
 * Finds where a confirmed match of patternStr that ends at index end begins:
 * walks backwards pairing text characters with keyword characters (ignoring
 * case) and stepping over everything else.
 */
private static int matchStart(String content, int end, String patternStr) {
    int p = patternStr.length() - 1;
    int j = end;
    while (j >= 0 && p >= 0) {
        if (sameIgnoringCase(content.charAt(j), patternStr.charAt(p))) {
            p--;
        }
        j--;
    }
    if (p >= 0) {
        // Could not pair every keyword character: fall back to the keyword length.
        return Math.max(0, end - patternStr.length() + 1);
    }
    return j + 1;
}

/** Compares two chars the way String.equalsIgnoreCase compares characters. */
private static boolean sameIgnoringCase(char a, char b) {
    return a == b
            || Character.toUpperCase(a) == Character.toUpperCase(b)
            || Character.toLowerCase(a) == Character.toLowerCase(b);
}
/**
 * Appends one "*" to sb for every character of patternStr.
 *
 * @param patternStr text whose length decides how many stars are written
 * @param sb         buffer the stars are appended to
 */
private static void appendStar(String patternStr, StringBuilder sb) {
    int remaining = patternStr.length();
    while (remaining > 0) {
        sb.append("*");
        remaining--;
    }
}
/**
 * Drains aps from the front. For each entry, the level of its owning
 * UnionPattern is added to levelSet when all of that union's atomic patterns
 * are present among the entries not yet removed.
 *
 * @param aps      the matched atomic patterns; empty when this method returns
 * @param levelSet receives one level per qualifying entry
 */
private void parseAtomicPatternSet(Vector<AtomicPattern> aps,
        Vector<Integer> levelSet) {
    while (!aps.isEmpty()) {
        UnionPattern up = aps.get(0).getBelongUnionPattern();
        if (up.isIncludeAllAp(aps)) {
            // Integer.valueOf replaces the deprecated boxing constructor.
            levelSet.add(Integer.valueOf(up.getLevel()));
        }
        aps.remove(0);
    }
}
// Initialization of the shift table and the hash table.
/**
 * Builds the shift table and the hash table from the registered keywords.
 */
private void init() {
    for (int i = 0; i < maxIndex; i++) {
        hashTable[i] = new Vector<AtomicPattern>();
    }
    shiftTableInit();
    hashTableInit();
    // Set last: if building throws, the next parse() retries instead of
    // running on half-built tables.
    initFlag = true;
}

/**
 * Forgets all registered keywords and clears initFlag, so keywords can be
 * added again and the tables are rebuilt by the next init().
 */
public void clear() {
    tmpUnionPatternSet.clear();
    initFlag = false;
}

/**
 * Fills shiftTable: 2 by default, at most 1 for the second-to-last character
 * of any keyword, 0 for the last character of any keyword. Upper- and
 * lower-case variants are registered too, because the matcher compares
 * ignoring case.
 */
private void shiftTableInit() {
    for (int i = 0; i < maxIndex; i++) {
        shiftTable[i] = 2;
    }
    Vector<UnionPattern> upSet = tmpUnionPatternSet.getSet();
    for (int i = 0; i < upSet.size(); i++) {
        Vector<AtomicPattern> apSet = upSet.get(i).getSet();
        for (int j = 0; j < apSet.size(); j++) {
            Pattern pattern = apSet.get(j).getPattern();
            // charAtEnd returns 0 when the keyword is too short; slot 0 is then capped.
            capShift(pattern.charAtEnd(1), 1);
            capShift(pattern.charAtEnd(0), 0);
        }
    }
}

/** Lowers the shift of ch and of its case variants to at most limit. */
private void capShift(char ch, int limit) {
    char[] variants = { ch, Character.toLowerCase(ch), Character.toUpperCase(ch) };
    for (int v = 0; v < variants.length; v++) {
        if (shiftTable[variants[v]] > limit) {
            shiftTable[variants[v]] = limit;
        }
    }
}

/**
 * Fills hashTable: every keyword is listed under its last character and under
 * that character's distinct case variants. Keywords whose last character is
 * reported as 0 are skipped.
 */
private void hashTableInit() {
    Vector<UnionPattern> upSet = tmpUnionPatternSet.getSet();
    for (int i = 0; i < upSet.size(); i++) {
        Vector<AtomicPattern> apSet = upSet.get(i).getSet();
        for (int j = 0; j < apSet.size(); j++) {
            AtomicPattern ap = apSet.get(j);
            char last = ap.getPattern().charAtEnd(0);
            if (last != 0) {
                char lower = Character.toLowerCase(last);
                char upper = Character.toUpperCase(last);
                hashTable[last].add(ap);
                // Add each variant once only; uncased characters map to themselves.
                if (lower != last) {
                    hashTable[lower].add(ap);
                }
                if (upper != last && upper != lower) {
                    hashTable[upper].add(ap);
                }
            }
        }
    }
}
}class Pattern { // string
/** Wraps the given keyword text. */
Pattern(String str) {
    this.str = str;
}

/**
 * Returns the character located index positions before the end of the text
 * (0 is the last character, 1 the one before it), or 0 when the text is too
 * short to have such a position.
 */
public char charAtEnd(int index) {
    int position = str.length() - index - 1;
    if (str.length() > index) {
        return str.charAt(position);
    }
    return 0;
}

/** The keyword text. */
public String str;

/** Returns the keyword text. */
public String getStr() {
    return this.str;
}
}class AtomicPattern {
// public boolean findMatchInString(String str) throws Exception {
// String patStr = new String(this.pattern.str.getBytes("ISO-8859-1"), "UTF-8");
// str = new String(str.getBytes("ISO-8859-1"), "UTF-8");
// if (patStr.length() > str.length())
// return false;
// int beginIndex = str.lastIndexOf(patStr.charAt(0) + "");
// if (beginIndex != -1){
// String eqaulLengthStr = str.substring(beginIndex);
// if (patStr.equalsIgnoreCase(eqaulLengthStr))
// return true;
// }
// return false;
// }
/**
 * Tells whether str ends with this pattern's text, comparing letters without
 * regard to case and stepping over every character isValidChar rejects.
 *
 * The scan runs backwards from the end of str, so keywords whose first
 * character occurs again inside them (for example "abab") are found too.
 *
 * @param str the text examined; the keyword must end at its end
 * @return true when the keyword is found; false otherwise, also for an empty keyword
 */
public boolean findMatchInString(String str) {
    String word = this.pattern.str;
    if (word.length() == 0 || word.length() > str.length()) {
        return false;
    }
    int remaining = word.length() - 1; // index of the next keyword char to pair
    for (int j = str.length() - 1; j >= 0 && remaining >= 0; j--) {
        char ch = str.charAt(j);
        if (!this.isValidChar(ch)) {
            continue; // ignored character
        }
        char expected = word.charAt(remaining);
        boolean same = ch == expected
                || Character.toUpperCase(ch) == Character.toUpperCase(expected)
                || Character.toLowerCase(ch) == Character.toLowerCase(expected);
        if (!same) {
            return false;
        }
        remaining--;
    }
    // True only when every keyword character found its partner.
    return remaining < 0;
}
/**
 * Returns content reduced to the characters isValidChar accepts, in their
 * original order.
 *
 * @param content the text to reduce
 * @return the kept characters; empty when none qualifies
 */
private String preConvert(String content) {
    // A builder avoids re-copying the partial result on every kept character.
    StringBuilder kept = new StringBuilder(content.length());
    for (int i = 0; i < content.length(); i++) {
        char ch = content.charAt(i);
        if (this.isValidChar(ch)) {
            kept.append(ch);
        }
    }
    return kept.toString();
}
/**
 * Tells whether ch is an ASCII digit, an ASCII letter, or lies in one of the
 * ranges U+4E00..U+9FA5 and U+F900..U+FA2D.
 */
private boolean isValidChar(char ch) {
    boolean digit = ch >= '0' && ch <= '9';
    boolean upperLetter = ch >= 'A' && ch <= 'Z';
    boolean lowerLetter = ch >= 'a' && ch <= 'z';
    boolean firstRange = ch >= 0x4e00 && ch <= 0x9FA5;
    boolean secondRange = ch >= 0xF900 && ch <= 0xFA2D;
    return digit || upperLetter || lowerLetter || firstRange || secondRange;
}

/** Wraps the given pattern. */
AtomicPattern(Pattern pattern) {
    this.pattern = pattern;
}

/** The wrapped keyword pattern. */
private Pattern pattern;

/** The union pattern this atomic pattern belongs to. */
public UnionPattern belongUnionPattern;

/** Returns the union pattern this atomic pattern belongs to. */
public UnionPattern getBelongUnionPattern() {
    return this.belongUnionPattern;
}

/** Sets the union pattern this atomic pattern belongs to. */
public void setBelongUnionPattern(UnionPattern belongUnionPattern) {
    this.belongUnionPattern = belongUnionPattern;
}

/** Returns the wrapped keyword pattern. */
public Pattern getPattern() {
    return this.pattern;
}

/** Replaces the wrapped keyword pattern. */
public void setPattern(Pattern pattern) {
    this.pattern = pattern;
}
}class SameAtomicPatternSet {
/** The atomic patterns of this set. */
public Vector<AtomicPattern> SAPS;

/** Creates a set with an empty pattern list. */
SameAtomicPatternSet() {
    this.SAPS = new Vector<AtomicPattern>();
}
}class UnionPattern { // union string
/** Creates a union pattern without atomic patterns. */
UnionPattern() {
    this.apSet = new Vector<AtomicPattern>();
}

/** The atomic patterns that make up this union. */
public Vector<AtomicPattern> apSet;

/** Adds an atomic pattern to this union. */
public void addNewAtomicPattrn(AtomicPattern ap) {
    apSet.add(ap);
}

/** Returns the list of atomic patterns itself, not a copy. */
public Vector<AtomicPattern> getSet() {
    return this.apSet;
}

/**
 * Tells whether every atomic pattern of this union has an entry in inAps
 * with the same text, ignoring case.
 */
public boolean isIncludeAllAp(Vector<AtomicPattern> inAps) {
    // Fewer candidates than members: they cannot all be present.
    if (inAps.size() < apSet.size()) {
        return false;
    }
    for (AtomicPattern member : apSet) {
        if (!isInAps(member, inAps)) {
            return false;
        }
    }
    return true;
}

/** Tells whether inAps holds an entry whose text equals ap's text, ignoring case. */
private boolean isInAps(AtomicPattern ap, Vector<AtomicPattern> inAps) {
    String wanted = ap.getPattern().str;
    for (AtomicPattern candidate : inAps) {
        if (wanted.equalsIgnoreCase(candidate.getPattern().str)) {
            return true;
        }
    }
    return false;
}

/** Sets the level of this union. */
public void setLevel(int level) {
    this.level = level;
}

/** Returns the level of this union. */
public int getLevel() {
    return level;
}

/** The level of this union. */
private int level;
}class UnionPatternSet { // union string set
/** Creates an empty set of union patterns. */
UnionPatternSet() {
    unionPatternSet = new Vector<UnionPattern>();
}

/** Adds a union pattern to this set. */
public void addNewUnionPattrn(UnionPattern up) {
    unionPatternSet.add(up);
}

/** The union patterns of this set. */
public Vector<UnionPattern> unionPatternSet;

/** Returns the list of union patterns itself, not a copy. */
public Vector<UnionPattern> getSet() {
    return this.unionPatternSet;
}

/** Removes every union pattern from this set. */
public void clear() {
    this.unionPatternSet.clear();
}
}
附上我之前改写的一个吧。