C++里创建 Trie字典树（中文词典）（二）（插入、查找、导入、导出）

　　萌新做词典第二篇，做得不好，还请指正，谢谢大佬！

　　做好了插入与遍历功能之后，我发现最基本的查找功能没有实现，同时还希望能够把内存的数据存入文件保存下来，并可以从文件中导入词典。此外，数据的路径是存在配置文件中的。甚至，还想尝试类似自动补全的功能。当然了，是做一个比较low的补全，比如传入“编程”，能够得到”软件“、”学习“、”学习网站“、”入门“四个字符串。但是传入“编”不会得到“程”，因为“编程”没有录入词典。于是对原有的类进行了修改，并添加了一些函数。

　　先放上运行结果吧：

 test1

 ID :     char : 件    word : 编程软件

 ID :     char : 习    word : 编程学习

 ID :     char : 站    word : 编程学习网站

 ID :     char : 门    word : 编程入门

 test2

 ID :     char : 门    word : 编程入门

 ID :     char : 习    word : 编程学习

 ID :     char : 站    word : 编程学习网站

 ID :     char : 件    word : 编程软件

 find ID :     word : 编程学习

　　test1就是上一步里，把word的id打印了出来，并导出到文件中。

　　test2是从文件中导入，然后打出结果，并查找“编程学习”，如果存在就打出id。

　　首先，新增了一个配置文件类DictionaryConf，如下：

DictionaryConf.h

 #ifndef __DICTIONARYCONF_H__

 #define __DICTIONARYCONF_H__

 #include <string>

 namespace ccx{

 using std::string;

 class DictionaryConf

 {

     public:

         DictionaryConf();

         string getDictionaryPath();

     private:

         void GetConf();

         string _DictionaryPath;

 };

 }

 #endif

　　它的功能很简单，在构造的时候读取文件配置信息，对外只提供一个接口，就是获取一个字符串。它的实现如下：

DictionaryConf.cc

 #include "DictionaryConf.h"

 #include <json/json.h>

 #include <stdlib.h>

 #include <fstream>

 #include <iostream>

 namespace ccx{

 using std::ifstream;

 using std::cout;

 using std::endl;

 DictionaryConf::DictionaryConf()

 {

     GetConf();

 }

 void DictionaryConf::GetConf()

 {

     ifstream ifs;

     ifs.open("DictionaryConf.json");

     if(!ifs.good())

     {

         cout << "open DictionaryConf.json error" << endl;

         exit(EXIT_FAILURE);

     }

     Json::Value root;

     Json::Reader reader;

     if(!reader.parse(ifs, root, false))

     {

         cout << "DictionaryConf json read error" << endl;

         exit(EXIT_FAILURE);

     }

     _DictionaryPath = root["DictionaryPath"].asString();

     ifs.close();

 }

 string DictionaryConf::getDictionaryPath()

 {

     return _DictionaryPath;

 }

 }

　　配置文件是写成json格式的，方便读取：

DictionaryConf.json

 {"DictionaryPath" : "Dictionary.json"}

　　然后是修改已有的Dictionary类，增加它的功能（总觉得是不是要变胖接口了0。0）

Dictionary.h

 #ifndef __DICTIONARY_H__

 #define __DICTIONARY_H__

 #include "DictionaryData.h"

 #include "DictionaryConf.h"

 #include <memory>

 #include <vector>

 #include <list>

 namespace ccx{

 using std::shared_ptr;

 using std::vector;

 using std::list;

 class Dictionary

 {

     typedef unordered_map<string, pDictElem>::iterator WordIt;

     public:

         Dictionary();

         void push(const string & word);

         void push(vector<string> & words);

         int search(const string & word);//查找，返回词的ID，如果没找到返回-1

     private:

         void AddWord(const string & word, int wordId);

         void splitWord(const string & word, vector<string> & characters);//把词拆成字

         pDictElem _dictionary;

         DictionaryConf _conf;    

 //遍历用

     public:

         void next();

         string getCurChar();

         string getCurWord();

         int getCurWordId();

         void resetPoint();

         bool isEnd();

     private:

         void nextWord();

         pDictElem _pcur;

         WordIt _itcur;

 //用list实现栈，遍历时方便

         list<WordIt> _stackWord;

         list<pDictElem> _stackDict;

 //导入导出

     public:

         void leading_in();

         void leading_out();

 };

 }

 #endif

　　这又是一个最终的类。比之前的多了一个查找的函数，一个Add函数，和导入导出的函数。

　　Add函数是原来的push改个名字，原因就是从我这里导出到文件的词是包含他的ID的，导入的时候的ID是固定的，所以改成在添加的时候要传入ID。对于提供的接口push，调用了Add函数。

　　改动过的Add和push如下：

 void Dictionary::AddWord(const string & word, int wordId)

 {

     vector<string> characters;

     splitWord(word, characters);

     vector<string>::iterator it_char;

     it_char = characters.begin();

     pDictElem root;

     root = _dictionary;

     for(; it_char != characters.end(); ++it_char)

     {

         WordIt it_word;

         it_word = root->_words.find(*it_char);

         if(it_word == root->_words.end())

         {

             pair<string, pDictElem> temp;

             temp.first = *it_char;

             pDictElem dictemp(new DictElem);

             dictemp->_word = *it_char;

             dictemp->_wordId = ;

             temp.second = dictemp;

             root->_words.insert(temp);

             root = dictemp;

         }else{

             root = it_word->second;

         }

     }

     if(!root->_wordId)

     {

         root->_wordId = wordId;

     }

 }

　　总感觉29行的if可以去掉呢--！但是没有去试

 void Dictionary::push(const string & word)

 {

     ++(_dictionary->_wordId);

     AddWord(word, _dictionary->_wordId);

 }

 void Dictionary::push(vector<string> & words)

 {

     int size = words.size();

     for(int i = ; i < size; ++i)

     {

         push(words[i]);

     }

 }

　　对root的wordID的自增放到了这里。这样，导和的时候就可以调用Add函数了。（好吧我就是懒得重新写一个插入的函数0。0）

　　接下来是查找的函数，比较简单，直接把Add拷过来改了一下：

 int Dictionary::search(const string & word)

 {

     vector<string> characters;

     splitWord(word, characters);

     vector<string>::iterator it_char;

     it_char = characters.begin();

     pDictElem root;

     root = _dictionary;

     for(; it_char != characters.end(); ++it_char)

     {

         WordIt it_word;

         it_word = root->_words.find(*it_char);

         if(it_word == root->_words.end())

         {

             return -;

         }else{

             root = it_word->second;

         }

     }

     return root->_wordId;

 }

　　关于以上，有另外一个想法：不管是在插入还是查找，都是要先“查找”，看看有没有那个节点。对于查找操作，只要返回有或者没有就行了；而对于插入，如果没有就返回它的位置，从那里开始添加节点，这样似乎又可以复用一些代码了 0。0

　　最后就是导入和导出了。

　　关于这块，写之前一直在纠结，是要遍历所有词，把词完整地存下来。采用json的格式，对于“编程学习”与“编程训练”：

　　直接存进文件，后面带上ID的效果：

 [

     { "word" : "编程学习" , "id" : },

     { "word" : "编程训练" , "id" : )

 ]

　　还是把树按节点存下来，

（后来发现其实这是一个序列化与反序列化的问题——2016.12.4 23:54 注）

 [

     {"key" : "编" , "flag" : , "id" :  , "value" :

 　　　　{"key" : "程" , "flag" :  , "id" :  , "value" :

 　　　　　　{"key" : "学" , "flag" :  , "id" : , "value" :

 　　　　　　　　{"key" : "习" , "flag" :  , "id" : , "value" : "NULL"

 　　　　　　　　}

 　　　　　　},

 　　　　　　{"key" : "训" , "flag" :  , "id" : , "value" :

 　　　　　　　　{"key" : "练" , "flag" :  , "id" : , "value" : "NULL"

 　　　　　　　　}

 　　　　　　}

 　　　　}

 　　}

 ]

　　大致是这样一层嵌套一层的结构。虽然这种结构可以用栈来实现，即把Json::Value型数据入栈，但是后来想想这样虽然一次只存一个结点，好像很节约空间，但是每个节点还要存一些额外的信息，逻辑似乎比较复杂，于是决定用第一种方法来实现，比较简单粗爆。

　　代码如下：

 void Dictionary::leading_out()

 {

     Json::Value root;

     Json::FastWriter writer;

     resetPoint();

     while(!isEnd())

     {

         Json::Value elem;

         elem["Word"] = getCurWord();

         elem["WordId"] = getCurWordId();

         root.append(elem);

         next();

     }

     string words;

     words = writer.write(root);

     ofstream ofs;

     const char * path = _conf.getDictionaryPath().c_str();

     ofs.open(path);

     if(!ofs.good())

     {

         cout << "open Dictionary.json error(leading_out)" << endl;

         ofs.open("Dictionary.tmp");

         if(!ofs.good())

         {

             exit(EXIT_FAILURE);

         }

     }

     ofs << words;

     ofs.close();

 }

　　存入文件的效果如下：

 [{"Word":"编程软件","WordId":},{"Word":"编程学习","WordId":},{"Word":"编程学习网站","WordId":},{"Word":"编程入门","WordId":}]

　　只有一行（不要在意id顺序）。

　　之前想过一种方法能够让它以多行的形式存到文件中，做了如下处理：

     .

     .

     .

 ofs << "[" << endl;

 int size = root.size();

 for(int i = ; i < size; ++i)

 {

     string json_file = writer.write(root[i]);

     int len = json_file.size();

     json_file[len - ] = ' ';

     ofs << "\t" << json_file;

     if(i != size -)

     {

         ofs << "," << endl;

     }

     }

 ofs << endl << "]" << endl;

 ofs.close();

     .

     .

     .

　　一项一项地取出，把最后的'\n'替换成空格，然后自己控制输出格式。

　　导入格式要与导出格式对应，也使用json，如下：

 void Dictionary::leading_in()

 {

     ifstream ifs;

     const char * path = _conf.getDictionaryPath().c_str();

     ifs.open(path);

     if(!ifs.good())

     {

         cout << "open Dictionary.json error(leading_in)" << endl;

     }else{

         Json::Value root;

         Json::Reader reader;

         if(!reader.parse(ifs, root, false))

         {

             cout << "json read Dictionary.json error" << endl;

         }else{

             int size = root.size();

             for(int i = ; i < size; ++i)

             {

                 string word = root[i]["Word"].asString();

                 int wordId = root[i]["WordId"].asInt();

                 AddWord(word, wordId);

                 ++(_dictionary->_wordId);

             }

         }

     }

 }

　　这里要说明一下，导出和导入打开文件如果失败了没有exit的原因：

　　对于导出，如果打开一个文件失败了直接exit，那内存中的东西就全部丢失了，于是就采用在当路径下创建一个临时文件来存要存的内容。

　　对于导入，打开文件失败只是说明路径有问题或者文件本身有问题，修改一下就可以了，或者只是没有导入词典而已，程序中如果已经添加了一些词，此时退出也会使内容丢失。

　　当然，目前的机制还是有问题的，比如词ID的设置，如果在内存中已有词的情况下导入新词，可能会使ID错乱。对于导出，或许可以设置一个机制：任何位置要退出的时候都把内容存入临时文件中，可能会更好。

　　做到这里，突然又想实现词的补全功能：传入“编程”，能传出“学习”、“学习网站”等的集合，这个做好了再补上吧！