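// C++ Boost: an example of mixing Chinese characters with regex pattern strings
//
// Time: 2023-03-10 02:46:52
// C++ Boost: an example of mixing Chinese characters with regex pattern strings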
/*===============================================================
* Copyright (C) All rights reserved.
*
* 文件名称:StringProcess.cpp
* 创 建 者:
* 创建日期:2013年04月24日
* 描 述:
* 备 注:
* 更新日志:
*
================================================================*/
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
#include <sys/time.h>
#include<ctype.h>
#include<locale.h>
#include "boost/regex.hpp"
#include <iconv.h>
#include <errno.h>
#include<algorithm>
// please add your code here!
using namespace std;
#define MAX_LINE_LENGTH 1048576
#define TAGLEN 50
/**
 * @brief Strip leading and trailing whitespace from a C string, in place.
 *
 * The kept characters are shifted to the front of the buffer and the result
 * is NUL-terminated. A string made only of whitespace becomes empty.
 *
 * @param s NUL-terminated, writable string; a null pointer is ignored.
 */
void trim(char *s)
{
    if (s == NULL)
    {
        return;
    }

    // isspace() requires a value representable as unsigned char; bytes of
    // multi-byte text (e.g. GBK) are negative as plain char, hence the casts.
    char *start = s;
    while (*start != '\0' && isspace(static_cast<unsigned char>(*start)))
    {
        ++start;
    }

    // `end` is one past the last kept character, so an empty string never
    // causes a read before the start of the buffer.
    char *end = start + strlen(start);
    while (end > start && isspace(static_cast<unsigned char>(end[-1])))
    {
        --end;
    }

    const size_t kept = static_cast<size_t>(end - start);
    if (start != s)
    {
        memmove(s, start, kept);
    }
    s[kept] = '\0';
}

/**
 * @brief Lower-case the ASCII upper-case letters of a C string, in place.
 *
 * @param str NUL-terminated, writable string.
 * @return true when every character of the string is an ASCII letter
 *         (an empty string also yields true); false when str is null or the
 *         string contains any other character. Upper-case letters are
 *         converted in both cases.
 */
inline bool strTolower( char* str )
{
    if ( !str )
    {
        return false;
    }

    bool onlyLetters = true;
    for ( char *p = str; *p != '\0'; ++p )
    {
        if ( 'A' <= *p && *p <= 'Z' )
        {
            *p = static_cast<char>( *p + ( 'a' - 'A' ) );
        }
        else if ( !( 'a' <= *p && *p <= 'z' ) )
        {
            onlyLetters = false;
        }
    }
    return onlyLetters;
}

/**
 * @brief Convert a multi-byte string in the environment's locale encoding
 *        to a wide-character string.
 *
 * Calls setlocale(LC_CTYPE, "") on every invocation, which changes the
 * process-wide LC_CTYPE locale; mbstowcs() depends on it.
 *
 * @param sToMatch multi-byte input; only the part before the first NUL byte
 *                 is converted.
 * @return the wide string, or an empty string when the input is empty or
 *         contains a byte sequence that is invalid in the current locale.
 */
std::wstring String2Wstring(std::string sToMatch)
{
    setlocale( LC_CTYPE, "" ); // required: without it non-ASCII input fails to convert

    const char *src = sToMatch.c_str();

    // With a null destination mbstowcs only counts the wide characters
    // (terminator excluded); (size_t)-1 signals an invalid sequence.
    const size_t wideLen = mbstowcs( NULL, src, 0 );
    if ( wideLen == static_cast<size_t>( -1 ) || wideLen == 0 )
    {
        return std::wstring();
    }

    // Convert straight into the result; its length is set explicitly, so no
    // terminator has to be written by mbstowcs.
    std::wstring wsToMatch( wideLen, L'\0' );
    const size_t written = mbstowcs( &wsToMatch[0], src, wideLen );
    if ( written == static_cast<size_t>( -1 ) )
    {
        return std::wstring();
    }
    wsToMatch.resize( written );
    return wsToMatch;
}
/**
 * @brief Convert a wide-character string to a multi-byte string in the
 *        current LC_CTYPE locale encoding (used for output).
 *
 * This function does not call setlocale() itself; wcstombs() uses whatever
 * LC_CTYPE locale is active when it is called.
 *
 * @param sToMatch wide input; only the part before the first L'\0' is
 *                 converted.
 * @return the multi-byte string, or an empty string when the input is empty
 *         or contains a character that cannot be represented in the current
 *         locale.
 */
std::string Wstring2String(std::wstring sToMatch)
{
    const wchar_t *src = sToMatch.c_str();

    // With a null destination wcstombs only counts the bytes needed
    // (terminator excluded); (size_t)-1 signals an unconvertible character.
    const size_t byteLen = wcstombs( NULL, src, 0 );
    if ( byteLen == static_cast<size_t>( -1 ) || byteLen == 0 )
    {
        return std::string();
    }

    // Convert straight into the result; its length is set explicitly, so no
    // terminator has to be written by wcstombs.
    std::string sResult( byteLen, '\0' );
    const size_t written = wcstombs( &sResult[0], src, byteLen );
    if ( written == static_cast<size_t>( -1 ) )
    {
        return std::string();
    }
    sResult.resize( written );
    return sResult;
}
/**
 * @brief Convert a byte buffer from one character encoding to another with
 *        iconv.
 *
 * The converted bytes are copied to deststr and NUL-terminated at the
 * converted length. Whatever was produced before a conversion error is still
 * copied out, and destlen still reports that length.
 *
 * NOTE(review): the numeric literals of this function were lost from the
 * source text. The return codes (0 = success, 1 = failure) and the size of
 * the temporary output buffer (4 * srclen + 4) are reconstructions - confirm
 * against callers.
 *
 * @param toCode   iconv name of the target encoding.
 * @param fromCode iconv name of the source encoding.
 * @param srcstr   input bytes; not modified.
 * @param deststr  output buffer. Its capacity is not passed in, so the caller
 *                 must provide room for the converted bytes plus one
 *                 terminating NUL.
 * @param srclen   number of input bytes to convert.
 * @param destlen  set to the number of converted bytes written to deststr
 *                 (terminator excluded); the incoming value is ignored.
 * @return 0 when the whole input converted cleanly; 1 when the converter
 *         could not be opened or iconv reported an error or any
 *         non-reversible conversion.
 */
int toAnotherCode(const char *toCode,const char *fromCode,char *srcstr, char *deststr, size_t srclen,size_t &destlen)
{
    const iconv_t convertor = iconv_open(toCode, fromCode);
    if (convertor == (iconv_t)(-1))
    {
        fprintf(stderr,"convertor device initailization failed!\n");
        return 1;
    }

    // Temporary output area; never zero-sized, so an empty input is safe.
    const size_t capacity = srclen * 4 + 4;
    std::string buffer(capacity, '\0');

    // iconv advances the pointers and decrements the counters it is given,
    // so it works on local copies and the caller's pointer stays untouched.
    char *in = srcstr;
    size_t inLeft = srclen;
    char *out = &buffer[0];
    size_t outLeft = capacity;

    const size_t rc = iconv(convertor, &in, &inLeft, &out, &outLeft);
    iconv_close(convertor);

    // Publish the length first, then terminate at that length. Terminating
    // at the caller's incoming destlen would clobber converted data.
    destlen = capacity - outLeft;
    memcpy(deststr, buffer.data(), destlen);
    deststr[destlen] = '\0';

    // (size_t)-1 is an iconv error; a positive value counts non-reversible
    // conversions. Both are reported as failure.
    return (rc != 0) ? 1 : 0;
}
/**
 * @brief Write the one-line command-line usage summary to stderr.
 */
void PrintUsage()
{
    static const char kUsageLine[] =
        "prog [IN]hzpylist_file [IN]input_file [OUT]output_file [OUT]errdmp_file\n";
    fputs( kUsageLine, stderr );
}
void testRegex()
{
string s="刘禹,刘德华,刘佳佳。。。王大虎。。。刘长春,xixi";
string t="刘[^刘]*?,";
wstring p=String2Wstring(t);
wstring ws=String2Wstring(s);
boost::wregex wreg(p,boost::regbase::icase|boost::regex::perl);
boost::wsmatch wm;
vector<string> results;
wstring::const_iterator it=ws.begin();
wstring::const_iterator end=ws.end();
while(boost::regex_search(it,end,wm,wreg))
{
wstring wtemp=wm[];
string temp=Wstring2String(wtemp);
results.push_back(temp);
it=wm[].second;
}
fprintf(stdout,"输出正则匹配结果\n");
for(vector<string>::iterator it=results.begin();it!=results.end();it++)
{
printf("%s\n",(*it).c_str());
}
}
int LoadFile(char* inputfile)
{
FILE *fin = NULL;
char line[] = {};
char word[] = {};
int len = ;
fin = fopen(inputfile, "r");
if (NULL == fin)
{
fprintf(stderr,"LoadAddress can not open inputfilename %s\n", inputfile);
return ;
} while(true)
{
fgets(line, , fin);
if (feof(fin))
{
break;
}
len = strlen(line);
if ( == line[] || '\n' != line[len - ])
{
continue;
}
line[len - ] = ;
string pattern ="首都或首府:";
string p1="([\u2E80-\u9FFF])+";
wstring wp1 = String2Wstring(p1);
//wstring wpattern = L"([\u2E80-\u9FFF])+";
wstring wpattern = L"([\u2E80-\u9FFF]+)"+String2Wstring(pattern)+L"([\u2E80-\u9FFF]+)";
wstring winputstr = String2Wstring(line);
boost::wregex wreg(wpattern, boost::regex::perl|boost::regbase::icase);
boost::smatch what;
boost::wsmatch wswhat;
wstring::const_iterator wstrit = winputstr.begin();
wstring::const_iterator wstrend = winputstr.end();
while (boost::regex_search(wstrit, wstrend, wswhat, wreg))
{
wstring ws1 = wswhat[];
wstring ws2 = wswhat[];
string s1 = Wstring2String(ws1);
string s2 = Wstring2String(ws2);
fprintf(stdout, "%s\t%s\n", s1.c_str(), s2.c_str());
wstrit=wswhat[].second;
}
} if (NULL != fin)
{
fclose(fin);
fin = NULL;
}
return ;
}
/**
 * @brief Entry point: process the file named on the command line with
 *        LoadFile and report the elapsed wall-clock time on stderr.
 *
 * NOTE(review): the numeric literals of this function were lost from the
 * source text. The expected argument count (program name plus one input
 * file), the argv indices, the microsecond/millisecond factors and the exit
 * codes are reconstructions - confirm. The usage text printed by PrintUsage
 * lists four files, which does not match the single file used here.
 *
 * @return 1 when the argument count is wrong, 0 otherwise. The result of
 *         LoadFile is not checked.
 */
int main( int argc, char *argv[] )
{
    timeval tv1, tv2;
    gettimeofday(&tv1, NULL);

    if ( 2 != argc )
    {
        PrintUsage();
        return 1;
    }

    LoadFile(argv[1]);

    gettimeofday(&tv2, NULL);
    fprintf(stderr,"%s has finished congratulations!\n",argv[0]);

    // Computed in double: float cannot hold large microsecond counts exactly.
    const double elapsedUs = (tv2.tv_sec - tv1.tv_sec) * 1000000.0 + (tv2.tv_usec - tv1.tv_usec);
    fprintf( stderr,"time elapsed: %.2f ms\n", elapsedUs / 1000.0 );
    return 0;
}