本文转载自微信公众号《UP科技控》,作者柯南。转载本文请联系UP科技控公众号。敏感词和文本过滤是网站必不可少的功能,因此设计一个好的、高效的过滤算法是非常必要的。在文本过滤的算法中,DFA是唯一比较好的实现算法。DFA即Deterministic Finite Automaton,也就是确定有限自动机。它通过事件和当前状态得到下一个状态,即event+state=nextstate。在过滤敏感词的算法中,我们要减少计算量,而DFA算法几乎没有计算量,有的只是状态转移。下面看C#下的实现方法。1.建立一个敏感词库类:privateboolLoadDictionary(){varwordList=newList();if(_memoryLexicon==null){_memoryLexicon=newWordGroup[char.MaxValue];varwords=newSensitiveWordBll().GetAllWords();if(words==null)returnfalse;foreach(stringwordinwords){wordList.Add(word);if(word!=chineseWord)wordList.Add(chineseWord);}foreach(varwordinwordList){if(word.Length>0){vargroup=_memoryLexicon[word[0]];if(group==null){group=newWordGroup();_memoryLexicon[word[0]]=group;}group.Add(word.Substring(1));}}}returntrue;}2.构建敏感词检测类:privateboolCheck(stringblackWord){_wordlength=0;//Detect the next cursor of the source _nextCursor=_cursor+1;varfound=false;varcontinueCheck=0;//遍历单词的每一位for(vari=0;i=_sourceText.Length){if(i-1=stringlength跳出if(_nextCursor+offset>=_sourceText.Length)break;_wordlength++;}elsebreak;}if(_nextCursor+offset>=_sourceText.Length){found=false;break;}if(blackWord[i]==_sourceText[_nextCursor+offset]){found=true;continueCheck=0;}else{//如果没有找到则尝试继续匹配4个字符if(continueCheck<4&&_nextCursor<_sourceText.Length-1){continueCheck++;i--;}else{found=false;break;}}}_nextCursor=_nextCursor+1+offset;_wordlenght++;}returnfound;}}3.测试使用方法:_illegalWords=newList();if(string.IsNullOrEmpty(sourceText)&&string.IsNullOrEmpty(_sourceText)){returnsourceText;}if(!string.IsNullOrEmpty(sourceText))_sourceText=sourceText;_cursor=0;if(!LoadDictionary()){return_sourceText;}vartempString=_sourceText.ToCharArray();varsourceTextDbc=ToDBC(SourceText);for(vari=0;i0)ssx=ssx.Replace(word,"*".PadLeft(word.Length,'*'));}vardatetime2x=DateTime.Now;varmillisecondx=(datetime2x-datetimex).TotalMilliseconds;Console.WriteLine(millisecondx);Console.WriteLine(ssx);