1. 程式人生 > >C#敏感詞過濾演算法實現

C#敏感詞過濾演算法實現

1.DFA演算法簡介

DFA全稱為:Deterministic Finite Automaton,即確定有窮自動機。其特徵為:有一個有限狀態集合和一些從一個狀態通向另一個狀態的邊,每條邊上標記有一個符號,其中一個狀態是初態,某些狀態是終態。但不同於不確定的有限自動機,DFA中不會有從同一狀態出發的兩條邊標誌有相同的符號。

 

簡單點說就是,它是通過event和當前的state得到下一個state,即event+state=nextstate。可以理解為系統中有多個節點,通過傳入的event來確定走哪條路由到另一個節點,而節點的數量是有限的。

2.實現程式碼如下:

新建一個FilterHelper.cs類,放敏感詞的過濾統一處理方法

#region 非法關鍵字過濾 bate 1.1
    /// <summary>
    /// Illegal-keyword filter. Words are read from a dictionary file (one word per line)
    /// and matched against the source text; characters that are neither CJK ideographs,
    /// ASCII digits nor ASCII letters are ignored when they appear between the characters
    /// of a word. Matching is insensitive to letter case and to full-width/half-width form.
    /// </summary>
    public class FilterHelper
    {

        public FilterHelper() { }

        public FilterHelper(string dictionaryPath)
        {
            this.dictionaryPath = dictionaryPath;
        }

        private string dictionaryPath = string.Empty;
        /// <summary>
        /// Path of the dictionary file.
        /// </summary>
        public string DictionaryPath
        {
            get { return dictionaryPath; }
            set { dictionaryPath = value; }
        }

        /// <summary>
        /// In-memory dictionary indexed by the first character of each word.
        /// It has one slot for every possible char value, char.MaxValue included.
        /// </summary>
        private readonly WordGroup[] MEMORYLEXICON = new WordGroup[(int)char.MaxValue + 1];

        private string sourctText = string.Empty;
        /// <summary>
        /// Text to be checked.
        /// </summary>
        public string SourctText
        {
            get { return sourctText; }
            set { sourctText = value; }
        }

        private List<string> illegalWords = new List<string>();

        /// <summary>
        /// Illegal words found by the most recent call to <see cref="Filter"/>,
        /// exactly as they appeared in the source text.
        /// </summary>
        public List<string> IllegalWords
        {
            get { return illegalWords; }
        }

        /// <summary>
        /// Tells whether the character is a CJK ideograph in the range U+4E00..U+9FA5.
        /// </summary>
        /// <param name="character">Character to test.</param>
        /// <returns>true when the character lies in that range.</returns>
        private bool isCHS(char character)
        {
            return character >= '\u4e00' && character <= '\u9fa5';
        }

        /// <summary>
        /// Tells whether the character is an ASCII digit.
        /// </summary>
        /// <param name="character">Character to test.</param>
        /// <returns>true for '0'..'9'.</returns>
        private bool isNum(char character)
        {
            return character >= '0' && character <= '9';
        }

        /// <summary>
        /// Tells whether the character is an ASCII letter.
        /// </summary>
        /// <param name="character">Character to test.</param>
        /// <returns>true for 'a'..'z' and 'A'..'Z'.</returns>
        private bool isAlphabet(char character)
        {
            return (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z');
        }

        /// <summary>
        /// Tells whether the character may be skipped between two characters of a word,
        /// i.e. it is neither a CJK ideograph, a digit nor a letter.
        /// </summary>
        /// <param name="character">Character to test.</param>
        /// <returns>true when the character is ignorable filler.</returns>
        private bool IsSkippable(char character)
        {
            return !isCHS(character) && !isNum(character) && !isAlphabet(character);
        }

        /// <summary>
        /// Converts the input to half-width, lower-case form.
        /// </summary>
        /// <param name="input">Any non-null string.</param>
        /// <returns>A normalized string with the same length as the input.</returns>
        /// <remarks>
        /// The full-width space is 12288 and the half-width space is 32. Every other
        /// full-width character (65281-65374) is its half-width counterpart (33-126)
        /// plus 65248. Lower-casing is done per character with the invariant culture so
        /// that the result never depends on the current locale and positions in the
        /// result line up one-to-one with positions in the input.
        /// </remarks>
        private string ToDBC(string input)
        {
            char[] c = input.ToCharArray();
            for (int i = 0; i < c.Length; i++)
            {
                if (c[i] == 12288)
                {
                    c[i] = (char)32;
                }
                else if (c[i] > 65280 && c[i] < 65375)
                {
                    c[i] = (char)(c[i] - 65248);
                }
                c[i] = char.ToLowerInvariant(c[i]);
            }
            return new string(c);
        }

        /// <summary>
        /// Loads the dictionary file into the in-memory lexicon. Each line is trimmed and
        /// normalized with <see cref="ToDBC"/>; the word and its Traditional Chinese
        /// conversion are both registered. Does nothing when no dictionary path is set.
        /// </summary>
        private void LoadDictionary()
        {
            if (string.IsNullOrEmpty(DictionaryPath))
            {
                return;
            }

            Array.Clear(MEMORYLEXICON, 0, MEMORYLEXICON.Length);
            // Ordinal set: removes exact duplicates without relying on culture-aware sorting.
            HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);
            string[] lines = System.IO.File.ReadAllLines(DictionaryPath, System.Text.Encoding.Default);
            foreach (string line in lines)
            {
                // Trimming keeps blank lines and stray padding from becoming dictionary entries.
                string key = this.ToDBC(line.Trim());
                if (key.Length == 0)
                {
                    continue;
                }
                AddWord(key, seen);
                AddWord(Microsoft.VisualBasic.Strings.StrConv(key, Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0), seen);
            }
        }

        /// <summary>
        /// Registers one normalized word in the lexicon unless it is empty or already present.
        /// The word is stored without its first character, which is the lexicon index.
        /// </summary>
        /// <param name="word">Normalized dictionary word.</param>
        /// <param name="seen">Words registered so far during the current load.</param>
        private void AddWord(string word, HashSet<string> seen)
        {
            if (string.IsNullOrEmpty(word) || !seen.Add(word))
            {
                return;
            }

            WordGroup group = MEMORYLEXICON[(int)word[0]];
            if (group == null)
            {
                group = new WordGroup();
                MEMORYLEXICON[(int)word[0]] = group;
            }
            group.Add(word.Substring(1));
        }

        /// <summary>
        /// Tries to match the remainder of a dictionary word right after a position whose
        /// character already matched the word's first character.
        /// </summary>
        /// <param name="text">Normalized source text.</param>
        /// <param name="start">Index of the already matched first character.</param>
        /// <param name="tail">Dictionary word without its first character.</param>
        /// <returns>
        /// The number of characters consumed after <paramref name="start"/> (skipped filler
        /// included), or -1 when the word does not match completely.
        /// </returns>
        private int MatchTail(string text, int start, string tail)
        {
            int pos = start + 1;
            for (int k = 0; k < tail.Length; k++)
            {
                char expected = tail[k];
                // Skip filler characters, but never skip one that is itself the expected character.
                while (pos < text.Length && text[pos] != expected && IsSkippable(text[pos]))
                {
                    pos++;
                }
                // Running out of text in the middle of a word is a failed match, not a hit.
                if (pos >= text.Length || text[pos] != expected)
                {
                    return -1;
                }
                pos++;
            }
            return pos - start - 1;
        }

        /// <summary>
        /// Finds every dictionary word in <see cref="SourctText"/> and replaces it.
        /// The dictionary is reloaded from <see cref="DictionaryPath"/> on every call and
        /// <see cref="IllegalWords"/> is reset to the hits of this call. When several
        /// words match at the same position the longest match wins.
        /// </summary>
        /// <param name="replaceChar">Character written over every character of a hit.</param>
        /// <returns>
        /// The source text with all hits masked, or an empty string when the source text
        /// is null or empty.
        /// </returns>
        public string Filter(char replaceChar)
        {
            LoadDictionary();
            illegalWords.Clear();
            if (string.IsNullOrEmpty(sourctText))
            {
                return string.Empty;
            }

            // Normalize once; the result has the same length as the source, so indexes are shared.
            string normalized = ToDBC(sourctText);
            char[] result = sourctText.ToCharArray();
            for (int i = 0; i < normalized.Length; i++)
            {
                // Look up the words that start with the current character.
                WordGroup group = MEMORYLEXICON[(int)normalized[i]];
                if (group == null)
                {
                    continue;
                }

                int best = -1;
                int count = group.Count();
                for (int z = 0; z < count; z++)
                {
                    int consumed = MatchTail(normalized, i, group.GetWord(z));
                    if (consumed > best)
                    {
                        best = consumed;
                    }
                }
                if (best < 0)
                {
                    continue;
                }

                illegalWords.Add(sourctText.Substring(i, best + 1));
                for (int pos = i; pos <= i + best; pos++)
                {
                    result[pos] = replaceChar;
                }
                // Resume scanning right after the masked span (the loop adds the final +1).
                i += best;
            }
            return new string(result);
        }
    }
    /// <summary>
    /// A collection of words that share the same first character.
    /// </summary>
    class WordGroup
    {
        /// <summary>
        /// Backing list; words are kept in the order they were added.
        /// </summary>
        private readonly List<string> entries = new List<string>();

        public WordGroup()
        {
        }

        /// <summary>
        /// Appends a word to the group.
        /// </summary>
        /// <param name="word">Word to store.</param>
        public void Add(string word)
        {
            this.entries.Add(word);
        }

        /// <summary>
        /// Reports how many words the group holds.
        /// </summary>
        /// <returns>The number of stored words.</returns>
        public int Count()
        {
            int total = this.entries.Count;
            return total;
        }

        /// <summary>
        /// Fetches the word stored at the given position.
        /// </summary>
        /// <param name="index">Zero-based position of the word.</param>
        /// <returns>The word at that position.</returns>
        public string GetWord(int index)
        {
            string found = this.entries[index];
            return found;
        }
    }

    #endregion

3.相應敏感詞驗證的地方呼叫程式碼

  // Create the filter, passing the path that MapPath returns for the dictionary file as the dictionary path.
  FilterHelper filter = new FilterHelper(HttpContext.Current.Server.MapPath("~/xmlconfig/badword.txt"));   // file that stores the sensitive words
  // Text to check: the trimmed value of mainProduct (presumably a page input control; confirm against the page).
  filter.SourctText = this.mainProduct.Value.Trim();   
  // Run the filter with '*' as the replacement character and keep the returned string.
  string resultStr = filter.Filter('*');