K-means文本聚类算法C++实现

时间:2023-03-10 06:50:56
K-means文本聚类算法C++实现

FROM:http://www.cnblogs.com/finallyliuyu/archive/2010/09/03/1817348.html

 头文件:

#ifndef _Preprocess_H
#define _Preprocess_H
// NOTE(review): the original header names were lost in transcription
// (bare "#include" lines); reconstructed from the names used below.
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <map>
#include <set>
#include <utility>
#include <algorithm>
#include <cmath>
#include <locale>
#include <limits>
#include "ictclas30.h"
#include "boost/tr1/regex.hpp"
#include "boost/algorithm/string.hpp"
#include "windows.h"
// NOTE(review): ADO types (_ConnectionPtr, adCmdText, ...) are used below;
// presumably the msado15.dll #import lives in stdafx.h -- confirm.
using namespace std; class Preprocess
{
//typedef vector(Preprocess::*FUNCSEG)(string,set);
private:
char *bagofwordsAddress;//存放词袋子模型的位置
char * featurewordsAddress;//存放特征词文件的位置;
char *arffFileAddress;//存放ARFF文件的位置
char *infoFromWekaAddress;//存放调用weka后的实验结果
char *articleIdsAddress;//存放被聚类的文章的ID号
char *dbconnection;//数据库的链接字符串
char *dbselect;//数据库select语句
char *dbfield;//数据库字段
int beginIndex;//开始聚类的文章id
int endIndex;//结束聚类的文章id
public:
typedef vector(Preprocess::*FUNCSEG)(string,set);
Preprocess(int c_style_stringsize,const char *mydict,const char *keywordsinfo,const char *tobeCluster,const char * InfoFromWeka,const char *artileIds,const char *conn,const char *selectsql, int beginIndex,int endIndex)
{
bagofwordsAddress=new char[c_style_stringsize];
featurewordsAddress=new char[c_style_stringsize];
arffFileAddress=new char[c_style_stringsize];
infoFromWekaAddress=new char[c_style_stringsize];
articleIdsAddress=new char[c_style_stringsize];
dbconnection=new char[c_style_stringsize];
dbselect=new char[c_style_stringsize];
this->beginIndex=beginIndex;
this->endIndex=endIndex;
sprintf_s(bagofwordsAddress,c_style_stringsize,mydict);
sprintf_s(featurewordsAddress,c_style_stringsize,keywordsinfo);
sprintf_s(arffFileAddress,c_style_stringsize,tobeCluster);
sprintf_s(infoFromWekaAddress,c_style_stringsize,InfoFromWeka);
sprintf_s(articleIdsAddress,c_style_stringsize,artileIds);
sprintf_s(dbconnection,c_style_stringsize,conn);
sprintf_s(dbselect,c_style_stringsize,selectsql); } ~Preprocess()
{
delete []bagofwordsAddress;
delete []featurewordsAddress;
delete []arffFileAddress;
delete [] infoFromWekaAddress;
delete []articleIdsAddress;
delete []dbconnection;
delete []dbselect; }
void trim(string &str,const string val);//去除字符串首尾空白
//构建倒排表: key=word,val= a list of pairs which consists of articleid,and count, count=tf
int ConstructMap(mapint,int>>>&mymap,char *dbfield,FUNCSEG seg);
inline void TruncateArff()
{
ofstream ofile;
ofile.open(arffFileAddress,ios::trunc);
ofile.close();
}
//保存词袋子到硬盘
void save(mapint,int> > >&mymap);
//从内存中加载词袋子模型
void load(mapint,int> > >&mymap);
//打印词袋子模型
void print(mapint,int> > >&mymap);
//窄字符串转化成宽字符串
wstring myMultibyteToWideChar(string sResult);
//宽字符串转化成窄字符串
string myWideCharToMultibyte(wstring wsResult);
//调用ICTclass分词
string ICTsplit(const char *sInput);
//构造停用词表
setMakeStopSet();
//去除停用词,噪声词
vectorgoodWordsinPieceArticle(string rawtext,set stopwords);
//整数转化成字符串
string do_fraction(int val);
//浮点数转化成字符串
string do_fraction(double val, int decplaces=);
//特征词选择算法
void DFcharicteristicWordSelection(mapint,int>>> &mymap,int DFthreshold);
//获取最后的特征词
vector GetFinalKeyWords();
//获取特征词的maxTF,DF
vectorint,int> >GetfinalKeysMaxTFDF(mapint,int>>> &mymap);
//文档向量模型规范化
vectorint,double> > NormalizationVSM(vectorint,double> > tempVSM);
//建立文档向量模型并且写到arff文件里
void VSMFormation(mapint,int>>> &mymap); string FormatVSMtoString(vectorint,double> > tempVSM);
//写Arff文件头部
void WriteHeadArff();
void WriteTotalArff(char * dbfield,int DFthreshlod,bool isbagOfwordsexsist,FUNCSEG seg); map<</code>int,vector<</code>double> >VSMConstruction(mapint,int>>> &mymap); map<</code>double> > GetClusters(); double CalDotProductOfVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2); double CalCosineofVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2); vectorint,string> >GenerateClusterInfo(map<</code>int,vector<</code>double> >&vsmMatrix, map<</code>double> >&clusters); map<</code>int> >FetchArticlesOFClusters(map<</code>double> >&clusters,vectorint,string>>&resultInfo);
void RetreiveArticleInfoFromDataBase();
vector mySplit(string s,set stopwords);//分割关键词 }; #endif Preprocess类的函数功能实现文件: #include"stdafx.h"
#include "Preprocess.h" #pragma comment(lib, "ICTCLAS30.lib")
using namespace std;
bool isLonger(const pairint> &pair1, const pairint> &pair2)
{
return pair1.second>pair2.second;
}
bool cntAssist(const pairint> &pair1)
{
return pair1.second<=;
}
bool PredTF(const pair<</code>int,int>& pair1,int articleId)
{
return pair1.first==articleId; }
class PredTFclass
{
private: const int m;
public:
PredTFclass(int id):m(id){};
bool operator()(const pair<</code>int,int>& pair1){return PredTF(pair1,m);};
};
bool myCmp(const pairdouble>&pair1,const pairdouble>&pair2 )
{
return pair1.second>=pair2.second;
} void Preprocess:: trim(string &str,const string val)
{
str.erase(,str.find_first_not_of(val));
str.erase(str.find_last_not_of(val)+val.size());
}
int Preprocess::ConstructMap(mapint,int>>>&mymap,char *dbfield,FUNCSEG seg)
{
//setMakeStopSet();
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
_RecordsetPtr pRst(__uuidof(Recordset));
pConn->ConnectionString=dbconnection;
pConn->Open("","","",adConnectUnspecified);
pRst=pConn->Execute(dbselect,NULL,adCmdText);
setstopwords=MakeStopSet(); while(!pRst->rsEOF)
{ vectorwordcollection;
//string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
string rawtext=(_bstr_t)pRst->GetCollect(dbfield);
if(rawtext!="")
{
wordcollection=(this->*seg)(rawtext,stopwords);
string tempid=(_bstr_t)pRst->GetCollect("ArticleId");
int articleid=atoi(tempid.c_str());
for(vector::iterator strit=wordcollection.begin();strit!=wordcollection.end();strit++)
{
vectorint,int>>::iterator it;
if(mymap[*strit].empty())
{
pair<</code>int,int>mytemppair=make_pair(articleid,);
mymap[*strit].push_back(mytemppair); }
else
{
for(it=mymap[*strit].begin();it!=mymap[*strit].end();it++)
{
if(it->first==articleid)
{
it->second=++(it->second);
break;
} }
if(it==mymap[*strit].end())
{
pair<</code>int,int>mytemppair=make_pair(articleid,);
mymap[*strit].push_back(mytemppair);
} } } } pRst->MoveNext();
wordcollection.clear();
}
pRst->Close();
pConn->Close();
pRst.Release();
pConn.Release();
CoUninitialize(); return ; }
void Preprocess::save(mapint,int> > >&mymap)
{
ofstream outfile(bagofwordsAddress,ios::binary);
outfile<<mymap.size()<<endl;
mapint,int> > >::iterator it;
for (it=mymap.begin();it!=mymap.end();it++)
{ outfile<<it->first<<endl;
vectorint,int>>::iterator subit;
outfile<<it->second.size()<<endl;
for(subit=(it->second).begin();subit!=(it->second).end();++subit)
{
outfile<<subit->first<<" "<<subit->second<<" "<<";"<<" ";
}
outfile<<endl;
}
//outfile.write((char *)&mymap,sizeof(mymap)); outfile.close(); }
void Preprocess::load(mapint,int> > >&mymap)
{
std::locale loc1 = std::locale::global(std::locale(".936"));
{
// 在这里使用std::ifstream 或者 std::fstream
ifstream infile(bagofwordsAddress,ios::binary);
int lenMyMap;//保存词典长度
int lenVector;//保存每个词出现的文章数目
string key;//保存读出的map的键值
int articleId;//文章标号
int count;//在该文章中刚出现的数目
string comma;
string semicolon;
infile>>lenMyMap;
while(!infile.eof())
{
infile>>key;
infile>>lenVector;
vectorint,int> >temp;
for (int i=;i
{
infile>>articleId>>count>>semicolon;
temp.push_back(make_pair(articleId,count));
}
mymap[key]=temp; } infile.close();
}
std::locale::global(std::locale(loc1)); }
void print(mapint,int> > >&mymap)
{
cout<<mymap.size()<<endl;
mapint,int> > >::iterator it;
for (it=mymap.begin();it!=mymap.end();it++)
{ cout<<it->first<<endl;
vectorint,int>>::iterator subit;
cout<<it->second.size()<<endl;
for(subit=(it->second).begin();subit!=(it->second).end();++subit)
{
cout<<subit->first<<','<<subit->second<<";";
}
cout<<endl;
} }
set Preprocess::MakeStopSet()
{
set stopwordsSet;
ifstream ifile("stopwords.txt");
while(!ifile.eof())
{
string temp;
trim(temp," ");
ifile>>temp;
stopwordsSet.insert(temp);
}
return stopwordsSet;
} string Preprocess::do_fraction(int val)
{
ostringstream out;
out<<val;
string str= out.str(); //从流中取出字符串
str.swap(string(str.c_str()));//删除nul之后的多余字符
return str; }
string Preprocess::do_fraction(double val,int decplaces)
{ //int prec=numeric_limits::digits10;
char DECIMAL_POINT='.';
ostringstream out;
//out.precision(prec);
out<<val;
string str=out.str();
size_t n=str.find(DECIMAL_POINT);
if((n!=string::npos)&&n+decplaces
{
str[n+decplaces]='\0';
}
str.swap(string(str.c_str())); return str;
}
wstring Preprocess::myMultibyteToWideChar(string sResult)
{
int iWLen=MultiByteToWideChar( CP_ACP, , sResult.c_str(), sResult.size(), , );// 计算转换后宽字符串的长度。(不包含字符串结束符)
wchar_t *lpwsz= new wchar_t [iWLen+];
MultiByteToWideChar( CP_ACP, , sResult.c_str(), sResult.size(), lpwsz, iWLen ); // 正式转换。
lpwsz[iWLen] = L'\0';
wstring wsResult(lpwsz);
delete []lpwsz;
return wsResult;
}
string Preprocess::myWideCharToMultibyte(wstring wsResult)
{
string sResult;
int iLen= WideCharToMultiByte( CP_ACP, NULL, wsResult.c_str(), -, NULL, , NULL, FALSE ); // 计算转换后字符串的长度。(包含字符串结束符)
char *lpsz= new char[iLen];
WideCharToMultiByte( CP_OEMCP, NULL, wsResult.c_str(), -, lpsz, iLen, NULL, FALSE); // 正式转换。
sResult.assign( lpsz, iLen- ); // 对string对象进行赋值。
delete []lpsz;
return sResult; }
string Preprocess::ICTsplit(const char *sInput)
{
if(!ICTCLAS_Init())
{
printf("ICTCLAS INIT FAILED!\n");
string strerr(sInput);
return strerr;
}
ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);
//导入用户词典后 const char* sResult = ICTCLAS_ParagraphProcess(sInput, );
string strresult(sResult);
//printf("%s\n", sResult);
//把字符串转化成宽字符串
wstring wsResult=myMultibyteToWideChar(strresult);
boost::wregex wreg(L"\\s+");
wsResult=boost::regex_replace(wsResult,wreg,wstring(L"|"));
strresult=myWideCharToMultibyte(wsResult); //ofile<<str1;
//ofile.close();
//cout<<str1<<endl;
//ICTCLAS_FileProcess("text.txt","test_result.txt",1);
ICTCLAS_Exit(); return strresult;
}
vectorPreprocess::goodWordsinPieceArticle(string rawtext,set stopwords)
{
vector goodWordstemp;
vector goodWords;
const char* sInput=rawtext.c_str();
string sResult=ICTsplit(sInput);
wstring wsResult=myMultibyteToWideChar(sResult);
boost::wregex wreg(L"\\d+");//去掉中文空格
wsResult=boost::regex_replace(wsResult,wreg,wstring(L""));
//boost::regex_split(back_inserter(goodWordstemp),wsResult,wreg);
boost::split(goodWordstemp,wsResult,boost::is_any_of("|")); for(vector::iterator it=goodWordstemp.begin();it!=goodWordstemp.end();it++)
{
string temp=myWideCharToMultibyte(*it);
trim(temp," ");
if(!stopwords.count(temp)&&!temp.empty())
{
goodWords.push_back(temp);
} } return goodWords;
}
void Preprocess::DFcharicteristicWordSelection(mapint,int>>> &mymap,int DFthreshold)
{
int finalKeyWordsCount=;//计算共取了多少个关键词
vectorint> >tempvector;
for(mapint,int>>>::iterator it=mymap.begin();it!=mymap.end();++it)
{
tempvector.push_back(make_pair(it->first,(it->second).size()));
} stable_sort(tempvector.begin(),tempvector.end(),isLonger);
ofstream outfile(featurewordsAddress);
for(vectorint> >::iterator it=tempvector.begin();it!=tempvector.end();it++)
{
if(it->second>=DFthreshold)
{
//outfile<<it->first<<" "<<it->second<<endl;
outfile<<it->first<<endl;
finalKeyWordsCount++; } }
outfile.close();
cout<<"最后共选择特征词"<<finalKeyWordsCount<<endl;
cout<<"by the way,DFthreshold equals"<<DFthreshold<<endl; }
vectorPreprocess::GetFinalKeyWords()
{
vectormyKeys;
ifstream infile(featurewordsAddress);
while(!infile.eof())
{
string temp;
infile>>temp;
if(temp!="")
{
myKeys.push_back(temp);
} }
return myKeys;
}
vectorint,int> >Preprocess::GetfinalKeysMaxTFDF(mapint,int>>> &mymap)
{
vectorint,int> >maxTFandDF;
vectormyKeys=GetFinalKeyWords();
for(vector::iterator it=myKeys.begin();it!=myKeys.end();it++)
{
int DF=mymap[*it].size();
int maxTF=;
for(vectorint,int> >::iterator subit=mymap[*it].begin();subit!=mymap[*it].end();subit++)
{
if(subit->second>maxTF)
{
maxTF=subit->second;
} }
maxTFandDF.push_back(make_pair(maxTF,DF));
//find_if(mymap[*it].begin(),mymap[*it].end(),
}
return maxTFandDF;
}
vectorint,double> >Preprocess::NormalizationVSM(vectorint,double> > tempVSM)
{ double sum=;
for(vectorint,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
{
sum+=pow(vsmit->second,);
}
for(vectorint,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
{
vsmit->second/=sqrt(sum);
}
return tempVSM; }
string Preprocess::FormatVSMtoString(vectorint,double> > tempVSM)
{
string ret="{";
int commaindication=;
for(vectorint,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
{ ret+=do_fraction(vsmit->first)+" "+do_fraction(vsmit->second,);
if(commaindication
{
ret+=",";
}
commaindication++;
}
ret+="}";
return ret;
}
void Preprocess::WriteHeadArff()
{
ofstream ofile(arffFileAddress,ios::binary);
ofile<<"@relation aticle"<<endl;
ofile<<"\n";
vector myKeys=GetFinalKeyWords();
for(vector::iterator it=myKeys.begin();it!=myKeys.end();it++)
{
//string temp="@attribute "+"'"+(*it)+"'"+" real";
string temp="";
temp+="@attribute ";
temp+="'";
temp+=*(it);
temp+="'";
temp+=" real"; ofile<<temp<<endl;
}
ofile<<"\n"<<endl;
ofile<<"@data"<<endl;
ofile.close();
}
void Preprocess::VSMFormation(mapint,int>>> &mymap)
{ int corpus_N=endIndex-beginIndex+;
ofstream ofile1(articleIdsAddress,ios::binary);//保存文章编号的文件
ofstream ofile2(arffFileAddress,ios::binary|ios::app); vector myKeys=GetFinalKeyWords();
vectorint,int> >maxTFandDF=GetfinalKeysMaxTFDF(mymap);
for(int i=beginIndex;i<=endIndex;i++)
{ vectorint,double> >tempVSM;
for(vector::size_type j=;j
{
//vector >::iterator findit=find_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i));
double TF=(double)count_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i)); TF=0.5+0.5*(double)TF/(maxTFandDF[j].first);
TF*=log((double)corpus_N/maxTFandDF[j].second);
if(TF!=)
{
tempVSM.push_back(make_pair(j,TF)); } }
if(!tempVSM.empty())
{
tempVSM=NormalizationVSM(tempVSM);
string vsmStr=FormatVSMtoString(tempVSM);
ofile1<<i<<endl;
ofile2<<vsmStr<<endl;
}
tempVSM.clear(); }
ofile1.close();
ofile2.close(); }
void Preprocess::WriteTotalArff(char *dbfield,int DFthreshold,bool isbagOfWordsExist,FUNCSEG seg)
{ mapint,int>>> mymap;
if(!isbagOfWordsExist)
{
ConstructMap(mymap,dbfield,seg);
save(mymap);
cout<<"词袋子信息已经保存到硬盘"<<endl;
}
else
{
load(mymap);
}
DFcharicteristicWordSelection(mymap,DFthreshold);
WriteHeadArff();
VSMFormation(mymap);
cout<<"arff文件已经形成"<<endl; string temp(infoFromWekaAddress); cout<<"请您将使用weka聚类,并保存为"<<temp<<endl;
}
map<</code>int,vector<</code>double> > Preprocess::VSMConstruction(mapint,int>>> &mymap)
{
int corpus_N=endIndex-beginIndex+;
map<</code>int,vector<</code>double>> vsmMatrix;
vector myKeys=GetFinalKeyWords();
vectorint,int> >maxTFandDF=GetfinalKeysMaxTFDF(mymap);
for(int i=beginIndex;i<=endIndex;i++)
{
vectorint,double> >tempVSM;
for(vector::size_type j=;j
{
//vector >::iterator findit=find_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i));
double TF=(double)count_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i));
TF=0.5+(double)TF/(maxTFandDF[j].first);
TF*=log((double)corpus_N/maxTFandDF[j].second);
tempVSM.push_back(make_pair(j,TF)); }
if(!tempVSM.empty())
{
tempVSM=NormalizationVSM(tempVSM);
for(vectorint,double> >::iterator it=tempVSM.begin();it!=tempVSM.end();it++)
{
vsmMatrix[i].push_back(it->second);
} }
tempVSM.clear(); }
return vsmMatrix; }
map<</code>double> > Preprocess::GetClusters()
{ map<</code>double> >clusters;
ifstream ifile(infoFromWekaAddress);
string temp;
while(getline(ifile,temp))
{ boost::smatch matchcluster;
boost::regex regcluster("Cluster\\s+\\d+",boost::regex::icase);
if(boost::regex_search(temp,matchcluster,regcluster))
{
string clustertmp=matchcluster[].str();
string ordinates="";
getline(ifile,ordinates);
boost::regex regordinates("\\d+(\\.\\d{1,4})?");
boost::smatch matchordinates;
std::string::const_iterator it=ordinates.begin();
std::string::const_iterator end=ordinates.end();
while (boost::regex_search(it,end,matchordinates,regordinates))
{
string digitstemp=matchordinates[].str();
double digitval=0.0;
std::stringstream ss;
ss<<digitstemp;
ss>>digitval;
clusters[clustertmp].push_back(digitval);
it=matchordinates[].second;
} }
}
return clusters;
}
double Preprocess::CalDotProductOfVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2)
{
double result = 0.0f;
for (int i = ; i < vector1.size(); i++)
result += vector1[i] * vector2[i];
return result;
}
double Preprocess::CalCosineofVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2)
{
double numerator=CalDotProductOfVectors(vector1,vector2);
double denominator=CalDotProductOfVectors(vector1,vector1)*CalDotProductOfVectors(vector2,vector2);
denominator=sqrt(denominator);
return numerator/denominator;
}
vectorint,string> > Preprocess::GenerateClusterInfo(map<</code>int,vector<</code>double> >&vsmMatrix, map<</code>double> >&clusters)
{
vectorint,string> >resultInfo;
for(map<</code>int,vector<</code>double> >::iterator it=vsmMatrix.begin();it!=vsmMatrix.end();it++)
{
vectordouble> >clusterDistanceAist;
for(map<</code>double> >::iterator clusterit=clusters.begin();clusterit!=clusters.end();clusterit++)
{ double temp=CalCosineofVectors(it->second,clusterit->second);
clusterDistanceAist.push_back(make_pair(clusterit->first,temp)); }
sort(clusterDistanceAist.begin(),clusterDistanceAist.end(),myCmp);
vectordouble> >::iterator cDAit=clusterDistanceAist.begin(); resultInfo.push_back(make_pair(it->first,cDAit->first));
clusterDistanceAist.clear();
}
return resultInfo; }
map<</code>int> > Preprocess::FetchArticlesOFClusters(map<</code>double> >&clusters,vectorint,string>>&resultInfo)
{
map<</code>int>> articlesInfo; for(vectorint,string>>::iterator retit=resultInfo.begin();retit!=resultInfo.end();retit++)
{
for(map<</code>double> >::iterator it=clusters.begin();it!=clusters.end();it++)
{
if(retit->second==it->first)
{
articlesInfo[it->first].push_back(retit->first);
}
}
} return articlesInfo; }
void Preprocess::RetreiveArticleInfoFromDataBase()
{
mapint,int>>> mymap;
vectorint,string>>resultInfo;
map<</code>double> >clusters;
map<</code>int,vector<</code>double> >vsmMatrix;
map<</code>int>> articlesInfo;
ofstream ofile("F:\\cluster\\ArticlesInPerCluster.txt");
//boost::regex_replace(strresult)
//ConstructMap(mymap,1,500);
//save(mymap);
load(mymap);
vsmMatrix=VSMConstruction(mymap);
clusters=GetClusters();
resultInfo=GenerateClusterInfo(vsmMatrix,clusters);
articlesInfo=FetchArticlesOFClusters(clusters,resultInfo); for(map<</code>int>>::iterator it=articlesInfo.begin();it!=articlesInfo.end();it++)
{
ostringstream out;
string selectassist;
char *selectsql=new char[];
int count=;
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
_RecordsetPtr pRst(__uuidof(Recordset));
pConn->ConnectionString=dbconnection;
pConn->Open("","","",adConnectUnspecified);
cout <<it->first<<endl;
ofile<<it->first<<endl;
out<<"(";
count=;
for(int i=;isecond.size();i++)
{
out<<(it->second)[i];
if(countsecond.size()-)
{
out<<",";
}
count++; }
out<<")";
selectassist=out.str();
sprintf_s(selectsql,,"%s %s","Select ArticleTitle,class from News Where ArticleId in ",selectassist.c_str()); pRst=pConn->Execute(selectsql,NULL,adCmdText);
while(!pRst->rsEOF)
{
//string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
string title=(_bstr_t)pRst->GetCollect("ArticleTitle");
//string rawtext=(_bstr_t)pRst->GetCollect("ArticleText");
string categorization=(_bstr_t)pRst->GetCollect("class");
cout<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl;
ofile<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl; pRst->MoveNext(); }
pRst->Close();
pConn->Close();
pRst.Release();
pConn.Release();
CoUninitialize(); } ofile.close(); }
vectorPreprocess:: mySplit(string s,set stopwords)
{
vector wordCollection;
trim(s," "); int nPosBegin=;
int nPosEnd=s.find(' ',nPosBegin);
while(nPosEnd!=string::npos)
{
string temp=s.substr(nPosBegin,nPosEnd-nPosBegin);
trim(temp," ");
wordCollection.push_back(temp);
nPosBegin=s.find_first_not_of(' ',nPosEnd);
nPosEnd=s.find(' ',nPosBegin);
}
string temp=s.substr(nPosBegin,s.size()-nPosBegin);
trim(temp," ");
wordCollection.push_back(temp); return wordCollection; }