博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
java 实现DFA 算法(理论百度搜索)
阅读量:5925 次
发布时间:2019-06-19

本文共 8023 字,大约阅读时间需要 26 分钟。

DFA简介

DFA全称为:Deterministic Finite Automaton,即确定有穷自动机。(自己百度吧)


直接代码:

敏感词实体类

package com.nopsmile.dfa;public class Keywords {    private String pid;    private String Content;    public Keywords() {    }    public Keywords(String content) {        super();        Content = content;    }    public String getContent() {        return Content;    }    public void setContent(String content) {        Content = content;    }    public String getPid() {        return pid;    }    public void setPid(String pid) {        this.pid = pid;    }}

敏感词库初始化

package com.nopsmile.dfa;import java.util.HashMap;import java.util.HashSet;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Set;/** * 敏感词库初始化 *  */public class SensitiveWordInit{    /**     * 敏感词库     */    public HashMap sensitiveWordMap;    /**     * 初始化敏感词 keywords     */   public Map initKeyWord(List
sensitiveWords) { try { // 从敏感词集合对象中取出敏感词并封装到Set集合中 Set
keyWordSet = new HashSet
(); for (Keywords s : sensitiveWords) { keyWordSet.add(s.getContent().trim()); } // 将敏感词库加入到HashMap中 addSensitiveWordToHashMap(keyWordSet); } catch (Exception e) { e.printStackTrace(); } return sensitiveWordMap; } /** * 封装敏感词库 */ private void addSensitiveWordToHashMap(Set
keyWordSet) { // 初始化HashMap对象并控制容器的大小 sensitiveWordMap = new HashMap(keyWordSet.size()); // 敏感词 String key = null; // 用来按照相应的格式保存敏感词库数据 Map nowMap = null; // 用来辅助构建敏感词库 Map
newWorMap = null; // 使用一个迭代器来循环敏感词集合 Iterator
iterator = keyWordSet.iterator(); while (iterator.hasNext()) { key = iterator.next(); // 等于敏感词库,HashMap对象在内存中占用的是同一个地址,所以此nowMap对象的变化,sensitiveWordMap对象也会跟着改变 nowMap = sensitiveWordMap; for (int i = 0; i < key.length(); i++) { // 截取敏感词当中的字,在敏感词库中字为HashMap对象的Key键值 char keyChar = key.charAt(i); // 判断这个字是否存在于敏感词库中 Object wordMap = nowMap.get(keyChar); if (wordMap != null) { nowMap = (Map) wordMap; } else { newWorMap = new HashMap
(); newWorMap.put("isEnd", "0"); nowMap.put(keyChar, newWorMap); nowMap = newWorMap; } // 如果该字是当前敏感词的最后一个字,则标识为结尾字 if (i == key.length() - 1) { nowMap.put("isEnd", "1"); } } } }}

自定义的工具类

package com.nopsmile.dfa;import java.util.ArrayList;import java.util.Collections;import java.util.Comparator;import java.util.HashMap;import java.util.HashSet;import java.util.Iterator;import java.util.LinkedHashMap;import java.util.LinkedList;import java.util.List;import java.util.Map;import java.util.Set;import com.alibaba.fastjson.JSONArray;import net.sf.json.JSONObject;/** * 敏感词过滤工具类 *  * @author AlanLee * */public class SensitivewordUtils {    /**     * 敏感词库     */    public static Map sensitiveWordMap = null;    /**     * 只过滤最小敏感词     */    public static int minMatchTYpe = 1;    /**     * 过滤所有敏感词     */    public static int maxMatchType = 2;    /**     * 敏感词库敏感词数量     *     * @return     */    public static int getWordSize() {        if (SensitivewordUtils.sensitiveWordMap == null) {            return 0;        }        return SensitivewordUtils.sensitiveWordMap.size();    }    /**     * 是否包含敏感词     *     */    public static boolean isContaintSensitiveWord(String txt, int matchType) {        boolean flag = false;        for (int i = 0; i < txt.length(); i++) {            int matchFlag = checkSensitiveWord(txt, i, matchType);            if (matchFlag > 0) {                flag = true;            }        }        return flag;    }    /**     * 获取敏感词内容     *     * @param txt     * @param matchType     * @return 敏感词内容     */    public static Set
getSensitiveWord(String txt, int matchType) { Set
sensitiveWordList = new HashSet
(); for (int i = 0; i < txt.length(); i++) { int length = checkSensitiveWord(txt, i, matchType); if (length > 0) { // 将检测出的敏感词保存到集合中 sensitiveWordList.add(txt.substring(i, i + length)); i = i + length - 1; } } return sensitiveWordList; } /** * 替换敏感词 * */ public static String replaceSensitiveWord(String txt, int matchType, String replaceChar) { String resultTxt = txt; Set
set = getSensitiveWord(txt, matchType); Iterator
iterator = set.iterator(); String word = null; String replaceString = null; while (iterator.hasNext()) { word = iterator.next(); replaceString = getReplaceChars(replaceChar, word.length()); resultTxt = resultTxt.replaceAll(word, replaceString); } return resultTxt; } /** * 替换敏感词内容 * */ private static String getReplaceChars(String replaceChar, int length) { String resultReplace = replaceChar; for (int i = 1; i < length; i++) { resultReplace += replaceChar; } return resultReplace; } /** * 检查敏感词数量 * */ public static int checkSensitiveWord(String txt, int beginIndex, int matchType) { boolean flag = false; // 记录敏感词数量 int matchFlag = 0; char word = 0; Map nowMap = SensitivewordUtils.sensitiveWordMap; for (int i = beginIndex; i < txt.length(); i++) { word = txt.charAt(i); // 判断该字是否存在于敏感词库中 nowMap = (Map) nowMap.get(word); if (nowMap != null) { matchFlag++; // 判断是否是敏感词的结尾字,如果是结尾字则判断是否继续检测 if ("1".equals(nowMap.get("isEnd"))) { flag = true; // 判断过滤类型,如果是小过滤则跳出循环,否则继续循环 if (SensitivewordUtils.minMatchTYpe == matchType) { break; } } } else { break; } } if (!flag) { matchFlag = 0; } return matchFlag; } /** * 敏感词汇对应个数 * 返回 "关键字"="关键字个数" * */ public static Map getSensitiveWordSum(String txt, int matchType) { Map
map = new HashMap
(); for (int i = 0; i < txt.length(); i++) { int length = checkSensitiveWord(txt, i, matchType); if (length > 0) { // 将检测出的敏感词保存到集合中 String str=txt.substring(i, i + length); if(map.containsKey(str)) { map.put(str, map.get(str).intValue()+1); }else { map.put(str, new Integer(1)); } //System.out.println(txt.substring(i, i + length)); i = i + length - 1; } } return map; } /** * 对map数组value排序,并取前10 * this method will always sort the map; * isCondition is true condition can be used otherwise invalid * @param unsortMap * @return */ public static Map
sortByValue(Map
unsortMap,int condition,boolean isCondition) { // 1. Convert Map to List of Map List
> list = new LinkedList
>(unsortMap.entrySet()); // 2. Sort list with Collections.sort(), provide a custom Comparator // Try switch the o1 o2 position for a different order Collections.sort(list, new Comparator
>() { public int compare(Map.Entry
o1, Map.Entry
o2) { return (o2.getValue()).compareTo(o1.getValue()); } }); // 3. Loop the sorted list and put it into a new insertion order Map LinkedHashMap Map
sortedMap = new LinkedHashMap
(); if(isCondition) { for (int i = 0; i < list.size(); i++) { if (i < condition) { sortedMap.put(list.get(i).getKey(), list.get(i).getValue()); } } }else{ for (int i = 0; i < list.size(); i++) { sortedMap.put(list.get(i).getKey(), list.get(i).getValue()); } } return sortedMap; }}

使用上面类流程代码

Keywords ss=new Keywords("好");List list = new ArrayList();list.add(ss);SensitiveWordInit sensitiveWordInit = new SensitiveWordInit();Map sensitiveWordMap   = sensitiveWordInit.initKeyWord(list);// 传入SensitivewordEngine类中的敏感词库SensitivewordUtils.sensitiveWordMap = sensitiveWordMap;SensitivewordUtils.getSensitiveWordSum("需要检测的文本", 2)    ;

转载于:https://blog.51cto.com/4534309/2391498

你可能感兴趣的文章
让Visual Studio 2013为你自动生成XML反序列化的类
查看>>
[转载]C/C++框架和库
查看>>
Cygwin使用指南
查看>>
OPC Client “failed to execute OPCENUM” 解决方法
查看>>
教你创建Google网站地图Sitemap.xml(转)
查看>>
Oracle Grid 11.2.0.4 安装是出现"INS-30510: Insufficient number of ASM disks selected."
查看>>
纤程(FIBER)
查看>>
Maven Learning - Direct Dependencies & Transitive Dependencies
查看>>
远程访问CENTOS的MYSQL数据库设置
查看>>
煮茶社区AVR开发板第二版[转]
查看>>
Python中*args 和**kwargs的用法
查看>>
袁永福软件行业从业经历
查看>>
循序渐进DB2(第2版)——DBA系统管理、运维与应用案例
查看>>
BZOJ 3434 时空穿梭
查看>>
网游的服务器瓶颈
查看>>
win32的一个售票程序,收获有非常的多
查看>>
erlang 编译之 to_core
查看>>
做移动互联网App,你的测试用例足够吗?
查看>>
Perl的第二纪
查看>>
在Android应用中使用Pull解析XML文件(传智播客视频笔记)
查看>>