DFA敏感词过滤

时间:2022-10-12 15:25:39
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.json.JSONException;
import org.json.JSONObject;

public class DFA {
/**
* 根节点
*/
private static TreeNode rootNode = new TreeNode();

/**
* 创建关键词树时的临时根节点
*/
private static TreeNode rootNodeTemp = new TreeNode();

/**
* 关键词缓存
*/
private static ByteBuffer keywordBuffer = ByteBuffer.allocate(1024);

/**
* 关键词编码
*/
private static String charset = "utf-8";

/**
* 关键词树是否初始化
*/
private static boolean treeIsInit = false;

/**
* 创建DFA树
*
@param keywordList
*
@throws UnsupportedEncodingException
*/
public static void createKeywordTree(List<String> keywordList) throws UnsupportedEncodingException{
for (String keyword : keywordList) {
if(keyword == null) continue;
keyword
= keyword.trim().toLowerCase();
byte[] bytes = keyword.getBytes(charset);
TreeNode tempNode
= rootNodeTemp;
for (int i = 0; i < bytes.length; i++) {
int index = bytes[i] & 0xff;
TreeNode node
= tempNode.getSubNode(index);
if(node == null){
node
= new TreeNode();
tempNode.setSubNode(index, node);
}
tempNode
= node;
if(i == bytes.length - 1){
tempNode.setKeywordEnd(
true);
}
}
}
synchronized (DFA.class)
{
rootNode
= rootNodeTemp;
if(!treeIsInit)treeIsInit = true;
}

}

/**
* 检索关键词
*
@param text
*
@return
*
@throws UnsupportedEncodingException
*/
public static JSONObject searchKeyword(String text) throws UnsupportedEncodingException{
if(StringUtils.isEmpty(text)){
JSONObject json
= new JSONObject();
try
{
json.put(
"islegal", "1");
json.put(
"deststr", "");
}
catch (JSONException e)
{
e.printStackTrace();
}
return json;
}
if(!treeIsInit)createKeywordTree();
System.out.println(
"searchKeyword:"+text);
return searchKeyword(text.getBytes(charset),text.toLowerCase().getBytes(charset));
}

/**
* 检索关键词
*
@param oldBytes
*
@param bytes
*
@return
*/
public static JSONObject searchKeyword(byte[] oldBytes,byte[] bytes){
String is_legal
= "1";
JSONObject json
= new JSONObject();
String result
= null;
if(bytes == null || bytes.length == 0){
result
= "";
}
else{
synchronized (DFA.class)
{
TreeNode tempNode
= rootNode;
int rollback = 0;
int position = 0;
while (position < bytes.length) {
int index = bytes[position] & 0xFF;
keywordBuffer.put(bytes[position]);
tempNode
= tempNode.getSubNode(index);
if(tempNode == null){
position
= position - rollback;
rollback
= 0;
tempNode
= rootNode;
keywordBuffer.clear();
}
else if(tempNode.isKeywordEnd()){
keywordBuffer.flip();
for (int i = 0; i <= rollback; i++) {
bytes[position
-i] = 42;
oldBytes[position
-i] = 42;
}
keywordBuffer.limit(keywordBuffer.capacity());
rollback
= 1;
is_legal
= "0";
}
else{
rollback
++;
}
position
++;
}
}
try {
result
= new String(oldBytes,charset);

}
catch (Exception e) {
e.printStackTrace();
}

}
try
{
json.put(
"islegal", is_legal);
json.put(
"deststr", result);
}
catch (JSONException e)
{
// TODO Auto-generated catch block
e.printStackTrace();
}
return json;
}

public void setCharset(String charset) {
this.charset = charset;
}

public static void main(String[] args){
try{
List
<String> keys = new ArrayList<String>();
keys.add(
"TMD");
keys.add(
"변형");
String text
="你真TMD是个변형啊!";
DFA.createKeywordTree(keys);
JSONObject a
= DFA.searchKeyword(text);
System.out.println(a);
}
catch(Exception ex){
ex.printStackTrace();
}
}
}

 

 

import java.util.ArrayList;
import java.util.List;

public class TreeNode {
private static final int NODE_LEN = 256;

/**
* true 关键词的终结 ; false 继续
*/
private boolean end = false;

private List<TreeNode> subNodes = new ArrayList<TreeNode>(NODE_LEN);

public TreeNode(){
for (int i = 0; i < NODE_LEN; i++) {
subNodes.add(i,
null);
}
}

/**
* 向指定位置添加节点树
*
@param index
*
@param node
*/
public void setSubNode(int index, TreeNode node){
subNodes.set(index, node);
}

public TreeNode getSubNode(int index){
//System.out.println("index:"+index+" node:"+this+"--subNodes:"+subNodes.get(index));
return subNodes.get(index);
}


public boolean isKeywordEnd() {
return end;
}

public void setKeywordEnd(boolean end) {
this.end = end;
}
}