过滤敏感词组

1。请教如何过滤敏感词组，
如把骂人的话操你妈，傻逼，或一些反动的词组过滤掉。
2。谁有这类词的集合啊！

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

/// <summary>
/// Detects whether a text contains any word from a configured bad-word list.
/// </summary>
/// <remarks>
/// Matching is an exact, ordinal, case-sensitive substring match. To avoid
/// allocating a substring for every candidate position, several per-character
/// lookup tables are consulted first and the hash set is only probed when all
/// of them agree that a match is possible.
/// </remarks>
public class BadWordsFilter
{
    // One slot per UTF-16 code unit. The "+ 1" matters: char.MaxValue ('\uffff')
    // is itself a valid index, so a table of size char.MaxValue is one too short.
    private const int CharTableSize = char.MaxValue + 1;

    // All words of length >= 2 (single-character words live in charCheck).
    private HashSet<string> hash = new HashSet<string>();

    // fastCheck[c]: bit i (0..6) is set when c occurs at position i of some word;
    // bit 7 is set when c occurs at any position >= 7.
    private byte[] fastCheck = new byte[CharTableSize];

    // fastLength[c]: for words of length >= 2 starting with c, bit (length - 2)
    // is set, with every length >= 9 sharing bit 7.
    private byte[] fastLength = new byte[CharTableSize];

    // charCheck[c]: c on its own is a (single-character) word.
    private BitArray charCheck = new BitArray(CharTableSize);

    // endCheck[c]: c is the last character of some word of length >= 2.
    private BitArray endCheck = new BitArray(CharTableSize);

    private int maxWordLength = 0;
    private int minWordLength = int.MaxValue;

    public BadWordsFilter()
    {
    }

    /// <summary>
    /// Adds the given words to the filter. May be called more than once;
    /// words accumulate across calls.
    /// </summary>
    /// <param name="badwords">Words to detect. Null or empty entries are ignored.</param>
    /// <exception cref="ArgumentNullException"><paramref name="badwords"/> is null.</exception>
    public void Init(string[] badwords)
    {
        if (badwords == null)
        {
            throw new ArgumentNullException("badwords");
        }

        foreach (string word in badwords)
        {
            // An empty word has no first character to index the tables with and
            // would force minWordLength to 0, so it is skipped.
            if (string.IsNullOrEmpty(word))
            {
                continue;
            }

            maxWordLength = Math.Max(maxWordLength, word.Length);
            minWordLength = Math.Min(minWordLength, word.Length);

            // Positions 0..6 get their own bit; everything from 7 on shares bit 7.
            for (int i = 0; i < word.Length; i++)
            {
                fastCheck[word[i]] |= (byte)(1 << Math.Min(i, 7));
            }

            if (word.Length == 1)
            {
                charCheck[word[0]] = true;
            }
            else
            {
                fastLength[word[0]] |= (byte)(1 << Math.Min(7, word.Length - 2));
                endCheck[word[word.Length - 1]] = true;
                hash.Add(word);
            }
        }
    }

    /// <summary>
    /// Not implemented.
    /// </summary>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public string Filter(string text, string mask)
    {
        throw new NotImplementedException();
    }

    /// <summary>
    /// Reports whether <paramref name="text"/> contains at least one configured word.
    /// </summary>
    /// <param name="text">The text to scan.</param>
    /// <returns>
    /// <c>true</c> if any configured word occurs as a substring of
    /// <paramref name="text"/>; otherwise <c>false</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public bool HasBadWord(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }

        // Every position is considered as a potential start. The previous
        // skip-ahead logic could step over valid start positions (for example
        // the second 'a' in "aab" when looking for "ab").
        for (int start = 0; start < text.Length; start++)
        {
            char begin = text[start];

            // Bit 0 clear: no word starts with this character.
            if ((fastCheck[begin] & 1) == 0)
            {
                continue;
            }

            if (charCheck[begin])
            {
                return true;
            }

            // A candidate of length j + 1 can never exceed the longest word
            // or run past the end of the text.
            int lastOffset = Math.Min(maxWordLength, text.Length - start) - 1;

            for (int j = 1; j <= lastOffset; j++)
            {
                char current = text[start + j];

                // No word has this character at offset j, so no longer
                // candidate starting here can match either.
                if ((fastCheck[current] & (1 << Math.Min(j, 7))) == 0)
                {
                    break;
                }

                // Only build the substring when the length is plausible for
                // this first character and the last character can end a word.
                if (j + 1 >= minWordLength
                    && (fastLength[begin] & (1 << Math.Min(j - 1, 7))) != 0
                    && endCheck[current]
                    && hash.Contains(text.Substring(start, j + 1)))
                {
                    return true;
                }
            }
        }

        return false;
    }
}
http://www.cnblogs.com/xingd/archive/2008/01/31/1060425.html .NET脏字过滤算法 http://www.cnblogs.com/xingd/archive/2008/01/23/1050443.html .NET脏字过滤算法 http://www.cnblogs.com/goody9807/archive/2006/09/12/502094.html .NET脏字过滤算法
这个只能过滤正常的敏感词组，
但是非正常的就不行，举个例子，sb吧
要过滤 sb 很简单，但是 s.b、s-b、煞笔等变体就不行了。现在又出来了一种新的方式，就是竖排，如：胡 | 胡 |
锦 | 锦 |
涛 | 涛 |
是 | 是 |
敏 | 敏 |
感 | 感 |
词 | 词 |
组 | 组 |
对横排的过滤，通过一些积累，再加上对一些字符的正则匹配，能搞定 sb、s.b、s-b、煞笔等等。
对那种竖排的就困难了，现在网易、新浪什么的都做不到。
其实我感觉还是把这些脏字都放到数据库里的好，然后存储或者显示的时候replace一下搞定
测试一下csdn如何过滤脏话操你妈，fuck you 性交，性
以上只是测试，大家勿怪。