Trie树实现多模匹配算法的进一步优化

之前写过一篇关于Trie树实现多模匹配算法的文章，参见：基于Trie树的多模匹配算法的实现与优化，其实还可以进一步进行优化——在空间上进行优化。

1.进一步优化方案

上文中进行构造Trie树的过程中，每增加一个节点，就要不断的new一个新节点。这些新增加的节点都是随机的，我们不知到它的分不清况。他们通过指针联系起来。但是如果它们分布的过于分散的话，指针在寻址上面就要花费很大的时间，同时操作系统中有高速缓存机制，其速度要比内存快的多，如果指针总是跳来跳去的话，缓存的命中率就会下降。因此，为了降低寻址的开销，同时提高缓存的命中率，可以考虑用这样的办法：预先分配一个大的数组，数组的内容就是无数个节点，这样一来，这些节点就存在于大片的连续内存里面。不但能够降低寻址开销，还能大幅提高高速缓存的命中率。

将原来的构建Trie树的函数进行改进，改用一次性申请N个节点，而不是一个个的申请节点。

2.实验测试

表1 Trie树多模匹配时间

词典词长	建树时间（ms）	穿线时间（ms）	穷举匹配		KMP匹配		加速比
词典词长	建树时间（ms）	穿线时间（ms）	每字符跳转次数	匹配用时（ms）	每字符跳转次数	匹配用时（ms）	加速比
5～10	241	461	4.75	216	1	130	1.66
15～20	670	1612	5.01	551	1	206	2.67
25～30	1068	2824	5.08	923	1	288	3.20
45～50	1863	5292	5.13	1632	1	443	3.68
65～70	2481	7477	5.15	2293	1	585	3.92
75～80	2963	9061	5.16	2771	1	673	4.11

表2 Trie树多模匹配（空间优化）

词典词长	建树时间（ms）	穿线时间（ms）	穷举匹配		KMP匹配		加速比
词典词长	建树时间（ms）	穿线时间（ms）	每字符跳转次数	匹配用时（ms）	每字符跳转次数	匹配用时（ms）	加速比
5～10	107	464	4.75	206	1	104	1.98
15～20	206	1595	5.01	546	1	191	2.86
25～30	301	2783	5.08	907	1	274	3.31
45～50	487	5284	5.13	1661	1	427	3.89
65～70	750	7472	5.15	2333	1	568	4.11
75～80	1387	9178	5.16	3023	1	660	4.58

表3 Trie树多模匹配算法空间优化前后对比

词典词长	穷举匹配时间（ms）				KMP匹配时间（ms）
词典词长	优化前	优化后	效果	比率	优化前	优化后	效果	比率
5～10	216	206	10	4.63%	130	104	26	20.00%
15～20	551	546	9	1.63%	206	191	15	7.28%
25～30	923	907	16	1.73%	288	274	14	4.86%
45～50	1632	1661	-29	-1.78%	443	427	17	3.84%
65～70	2293	2333	-40	-1.74%	585	568	17	2.91%
75～80	2771	3023	-252	-9.10%	673	660	13	1.93%

其中，表1是原来的测试结果，表2是空间优化后的测试结果，表3是对这两个表进行的对比。

从表三中可以看出：

（1）对于穷举匹配来说，在词典规模比较小的时候，进行空间优化还是有效果的，但是词典规模大了以后，这种优化的效果减弱甚至效果相反。

（2）对于kmp匹配来说，同样是词典规模小的时候，优化效果明显，随着词典的扩大，这种效果减弱，应该是单词长度变长之后，Trie树的节点数目增多，相邻两层间的节点距离增加，Cache的命中率下降，因此寻址时间下降到优化前的水平。

3.源代码

代码如下：

#include <iostream>
#include <fstream>
#include <string>
#include <sys/time.h>
#include <cstring>
#include <queue>
#include <sstream>
#include <cstdlib>
#include <ctime>
#include <algorithm>

using namespace std;

const int MAX_NODE = 100000*30;		//Trie树中节点的最大数量
const int MAX = 5000000;
const int Son_Num = 26;			//每个孩子的最大分支个数
const int word_max_len = 30;		//词典中单词的最大长度

//---------------------Trie树的定义----------------------------

//节点定义
class Node{
	public:
		int flag;		//节点的是否有单词出现的标记
		int count;		//单词出现的次数
		string word;		//到达该节点时出现的单词
		vector<string> word_set;//到达该节点时能够匹配上的单词
		Node* failjump;		//匹配失败时指针的跳转目标
		Node* son[Son_Num];	//指向分支节点的指针，这里为26个
		Node* fail[Son_Num];	//指向各个分支跳转节点的指针，26个
	public:
		Node() : flag(0),count(0),word(""){
			failjump = NULL;
			memset(son, NULL, sizeof(Node*) * Son_Num);
			memset(fail, NULL, sizeof(Node*) * Son_Num);
		}
};

//trie树的操作定义
class Trie{
	private:
		Node* pArray;
		Node* pPos;
		Node* pRoot;
	private:
		void print(Node* pRoot);
	public:
		Trie();
		~Trie();
		void insert(string str);				//插入字符串
		bool search(string str, int &count);			//查询字符串
		bool remove(string str);				//删除字符串
		void destory(Node* pRoot);				//销毁Trie树
		void printAll();					//打印Trie树
		void failjump();					//穿线算法
		void multi_match_1(string &str, int begin, int end, vector<string> &result);	//多模匹配（回溯）
		void multi_match_2(string &str, int begin, int end, vector<string> &result);	//多模匹配（KMP）
};

//构造函数
Trie::Trie(){
	pArray = new Node[MAX_NODE];
	pPos = pArray;
	pRoot = pPos;

	//pRoot = new Node();
}

//析构函数
Trie::~Trie(){
	destory(pRoot);
	delete [] pArray;
	pPos == NULL;
}

//打印以root为根的Trie树
void Trie::print(Node* pRoot){
	if(pRoot != NULL){
		if(pRoot -> word != ""){
			cout << pRoot -> count << " " << pRoot -> word << endl;
		}
		if((pRoot -> word_set).size()){
			for(int i = 0; i < (pRoot -> word_set).size(); i++){
				cout << "--" << (pRoot -> word_set)[i];
			}
			cout << endl;
		}
		for(int i = 0; i < Son_Num; i++){
			print(pRoot -> son[i]);
		}
	}
}

//打印整棵树
void Trie::printAll(){
	print(pRoot);
}

//插入新的单词
void Trie::insert(string str){
	int index = 0;
	Node* pNode = pRoot;
	//不断向下搜索单词的字符是否出现
	for(int i = 0; i < str.size(); i++){
		index = str[i] - 'a';
		//字符在规定的范围内时，才进行检查
		if(index >= 0 && index < Son_Num){
			//父节点是否有指针指向本字符
			if(NULL == pNode -> son[index]){
				pPos++;
				pNode -> son[index] = pPos;

			}
			//指针向下传递
			pNode = pNode -> son[index];
		}else{
			return;
		}
	}
	//判断单词是否出现过
	if(pNode -> flag == 1){
		//单词已经出现过，计数器加1
		pNode -> count++;
		return;
	}else{
		//单词没有出现过，标记为出现，计数器加1
		pNode -> flag = 1;
		pNode -> count++;
		pNode -> word = str;
	}
}

//搜索指定单词，并返回单词出现次数（如果存在）
bool Trie::search(string str, int &count){
	Node* pNode = pRoot;
	int index = 0;
	int i = 0;
	while(pNode != NULL && i < str.size()){
		index = str[i] - 'a';
		if(index >= 0 && index < Son_Num){
			//字符在指定的范围内时
			pNode = pNode -> son[index];
			i++;
		}else{
			//字符不再指定的范围内
			return false;
		}
	}
	//判断字符串是否出现过
	if(pNode != NULL && pNode -> flag == 1){
		count = pNode -> count;
		return true;
	}else{
		return false;
	} 
}

//删除指定的单词
bool Trie::remove(string str){
	Node* pNode = pRoot;
	int i = 0;
	int index = 0;
	while(pNode != NULL && i < str.size()){
		index = str[i] - 'a';
		if(index >= 0 && index < Son_Num){
			pNode = pNode -> son[index];
			i++;
		}else{
			return false;
		}
	}
	if(pNode != NULL && pNode -> flag == 1){
		pNode -> flag = 0;
		pNode -> count = 0;
		return true;
	}else{
		return false;
	}
}

//销毁Trie树
void Trie::destory(Node* pRoot){
	if(NULL == pRoot){
		return;
	}
	for(int i = 0; i < Son_Num; i++){
		destory(pRoot -> son[i]);
	}
	pRoot == NULL;
}

//穿线算法
void Trie::failjump(){
	queue<Node*> q;
	//将根节点的孩子的failjump域和fail[]域都设置为根节点
	for(int i = 0; i < Son_Num; i++){
		if(pRoot -> son[i] != NULL){
			pRoot -> son[i] -> failjump = pRoot;
			q.push(pRoot -> son[i]);
		}else{
			pRoot -> fail[i] = pRoot;
		}	
	}
	//广度遍历树，为其他节点设置failjump域和fail[]域
	while(!q.empty()){
		Node *cur = q.front();
		q.pop();
		if(cur -> failjump != NULL){
			for(int j = 0; j < Son_Num; j++){
				//循环设置跳转域
				Node* fail = cur -> failjump;
				Node* cur_child = cur -> son[j];
				if(cur_child != NULL){
					//当孩子存在时，设置孩子的failjump
					while(fail != NULL){
						if(fail -> son[j] != NULL){
							//设置failjump域
							cur_child -> failjump = fail -> son[j];
							//设置word_set集合
							if((fail -> son[j] -> word_set).size()){
								cur_child -> word_set = fail -> son[j] -> word_set;

							}
							if(cur_child -> flag == 1){
								(cur_child -> word_set).push_back(cur_child -> word);
							}
							break;
						}else{
							fail = fail -> failjump;
						}
					}
					if(cur_child -> failjump == NULL){
						cur_child -> failjump = pRoot;
						if(cur_child -> flag == 1){
							(cur_child -> word_set).push_back(cur_child -> word);
						}
					}
					q.push(cur_child);
				}else{
					//当孩子不存在时，设置父节点fail[]域
					while(fail != NULL){
						if(fail -> son[j] != NULL){
							//设置对应的fail[j];
							cur -> fail[j] = fail -> son[j];
							break;
						}else{
							if(fail == pRoot){
								cur -> fail[j] = pRoot;
								break;
							}else{
								fail = fail -> failjump;
							}
						}
					}
				}
			}
		}
	}
}

//多模匹配算法1（穷举匹配）
void Trie::multi_match_1(string &str, int begin, int end, vector<string> &result){
	int count = 0;
	for(int pos = 0; pos < end; pos++){
		Node* pNode = pRoot;
		int index = 0;
		int i = pos;
		while(pNode != NULL && i < pos + word_max_len && i < end){
			index = str[i] - 'a';
			if(index >= 0 && index < Son_Num){
				//字符在指定的范围内时
				pNode = pNode -> son[index];
				count++;
				i++;
				//若字符串出现过，输出
				if(pNode != NULL && pNode -> flag == 1){
					//cout << pNode -> word << endl;
					result.push_back(pNode -> word);
				}
			}
		}
	}
	cout << "  跳转次数：count = " << count << endl;
	cout << "  每个字符平均跳转次数：" << double(count)/(end - begin) << endl;
}

//多模匹配算法2（类KMP匹配）
void Trie::multi_match_2(string &str, int begin, int end, vector<string> &result){
	int count_1 = 0, count_2 = 0, count_3 = 0;
	Node* pNode = pRoot;
	int index = 0;
	int i = begin;
	int word_set_size = 0;
	while(pNode != NULL && i < end){
		index = str[i] - 'a';
		if(index >= 0 && index < Son_Num){
			//字符在指定的范围内时
			if(pNode -> son[index]){
				//该孩子存在，继续向下搜索
				pNode = pNode -> son[index];
				count_1++;
			}else if(pNode != pRoot){
				//该孩子不存在，且当前不是根节点，进行跳转
				pNode = pNode -> fail[index];
				count_2++;
			}else{
				//该孩子不存在，且当前已经是跟节点，目标字符串前移
				count_3++;

			}

			//若该位置有匹配词语，输出
			if(word_set_size = (pNode -> word_set).size()){
				for(int i = 0; i < word_set_size; i++){
					//cout << (pNode -> word_set)[i] << endl;
					result.push_back((pNode -> word_set)[i]);
				}
			}
			//目标串前移
			i++;
		}
	}
	cout << "  跳转次数：count_1 = " << count_1 << "  count_2 = " << count_2 << "  count_3 = " << count_3 << endl;
}

//----------------------辅助代码----------------------------

//生成随机字符串
string rand_string(int min, int max){
	char a[MAX+1];
	int len = rand()%(max - min) + min;
	for(int i = 0; i < len; i++){
		a[i] = rand()%26 + 'a';
	}
	a[len] = '\0';
	string str(a);
	return str;
}

//-----------------测试代码---------------------------
//获取当前时间（ms）
long getCurrentTime(){
	struct timeval tv;
	gettimeofday(&tv, NULL);
	return tv.tv_sec*1000 + tv.tv_usec/1000;
}


//树的基本操作，增删查改测试
void Test_1(){

	//1.建立对象
	Trie trie;
	string str;
	ifstream fin;
	fin.open("dict.txt");

	//2.建立Trie树
	long time_1 = getCurrentTime();
	while(getline(fin, str, '\n')){
		trie.insert(str);
	}
	long time_2 = getCurrentTime();
	fin.close();

	//3.查找单词
	str = "siubetamwm";
	int count = -1;
	long time_3 = getCurrentTime();
	bool isFind = trie.search(str, count);
	cout << "要查找的字符串：" << str << endl; 
	long time_4 = getCurrentTime();
	if(isFind){
		cout << "  该单词存在，存在次数：" << count << endl;
	}else{
		cout << "  该单词不存在！" << endl;
	}

	//4.删除单词
	str = "lutgjrxjavgfkij";
	cout << "要删除的字符串：" << str << endl; 
	bool isRemove = trie.remove(str);	
	if(isRemove){
		cout << "  该单词存在，删除成功！" << endl;
	}else{
		cout << "  该单词不存在，删除失败！" << endl;
	}

}

//调用词典，多模匹配测试
void Test_2(){

	//1.建立对象
	Trie trie;
	string str;
	ifstream fin;
	fin.open("dict.txt");

	//2.建立Trie树
	long time_1 = getCurrentTime();
	while(getline(fin, str, '\n')){
		trie.insert(str);
	}
	fin.close();
	long time_2 = getCurrentTime();

	//3.将短字符串组合为长字符串
	string long_str;
	ostringstream string_out;
	fin.open("dict.txt");
	while(getline(fin, str, '\n')){
		string_out << str;
	}
	fin.close();
	long_str = string_out.str();

	//long_str = rand_string(MAX - 20, MAX);

	vector<string> result_1;
	vector<string> result_2;	

	//4.在长字符串中搜索字典中的字符串（方法一：穷举）
	cout << "穷举匹配：" << endl;
	long time_3 = getCurrentTime();
	trie.multi_match_1(long_str, 0, long_str.size(), result_1);
	long time_4 = getCurrentTime();
	cout << "穷举匹配完毕！" << endl;

	//5.进行穿线处理
	cout << "穿线处理" << endl;
	long time_5 = getCurrentTime();
	trie.failjump();
	long time_6 = getCurrentTime();
	cout << "穿线完毕" << endl;

	//6.在长字符串中搜索字典中的字符串（方法二）
	cout << "KMP匹配：" << endl;
	long time_7 = getCurrentTime();
	trie.multi_match_2(long_str, 0, long_str.size(), result_2);
	long time_8 = getCurrentTime();
	cout << "KMP匹配完毕" << endl;

	//7.输出结果
	cout << "目标字符串长度：" << long_str.size() << endl;
	cout << "穷举匹配结果数量：" << result_1.size() << endl;
	cout << "KMP匹配结果数量：" <<result_2.size() << endl;
	sort(result_1.begin(), result_1.end());
	sort(result_2.begin(), result_2.end());

	if(result_1 == result_2){
		cout << "两种多模匹配方式结果相符：True" << endl;
	}else{
		cout << "两种多模匹配方式结果不相符：False" << endl;
	}
	cout << endl;
	cout << "建立Trie树用时（ms）：" << time_2 - time_1 << endl;
	cout << "穿线算法用时（ms）：  " << time_6 - time_5 << endl;
	cout << "多模匹配1用时（ms）：" << time_4 - time_3 << endl;
	cout << "多模匹配2用时（ms）：" << time_8 - time_7 << endl;
	cout << "加速比：" << double(time_4 - time_3)/(time_8 - time_7) << endl;

}

//小测试
void Test_3(){

	//1.建立对象
	Trie trie;

	//2.建立Trie树
	string str_1 = "uuidi";
	string str_2 = "ui";
	string str_3 = "idi";
	string str_4 = "idk";
	string str_5 = "di";

	trie.insert(str_1);
	trie.insert(str_2);
	trie.insert(str_3);
	trie.insert(str_4);
	trie.insert(str_5);

	//3.构造待匹配的字符串
	string long_str = rand_string(MAX - 20, MAX);
	//string long_str = "uuidiuiidiidkdi";
	cout << "length: " << long_str.size() << endl;
	vector<string> result_1;
	vector<string> result_2;	

	//4.在长字符串中搜索字典中的字符串（方法一：穷举）
	cout << "穷举匹配：" << endl;
	long time_5 = getCurrentTime();
	trie.multi_match_1(long_str, 0, long_str.size(), result_1);
	long time_6 = getCurrentTime();
	cout << "穷举匹配完毕！" << endl;

	//5.进行穿线处理
	cout << "穿线处理" << endl;
	trie.failjump();
	cout << "穿线完毕" << endl;

	//6.在长字符串中搜索字典中的字符串（方法二）
	cout << "KMP匹配：" << endl;
	long time_7 = getCurrentTime();
	trie.multi_match_2(long_str, 0, long_str.size(), result_2);
	long time_8 = getCurrentTime();
	cout << "KMP匹配完毕" << endl;

	//7.输出结果
	cout << result_1.size() << endl;
	cout << result_2.size() << endl;
	sort(result_1.begin(), result_1.end());
	sort(result_2.begin(), result_2.end());

	cout << "目标字符传长度：" << long_str.size() << endl;
	if(result_1 == result_2){
		cout << "两种多模匹配方式结果相符：True" << endl;
	}else{
		cout << "两种多模匹配方式结果不相符：False" << endl;
	}
	cout << "多模匹配1用时（ms）：" << time_6 - time_5 << endl;
	cout << "多模匹配2用时（ms）：" << time_8 - time_7 << endl;

}

int main(){

	//Test_1();
	Test_2();
	//Test_3();

}

秒客网

Trie树实现多模匹配算法的进一步优化

相关文章