博客导出工具(C++实现,支持sina,csdn,自定义列表)

操作系统：Windows 全系列

编程工具：visual studio 2013

编程语言：VC++

最近博文更新得较频繁，为了防止账号异常引起****博文丢失，花了点时间做了个小工具来导出博文，用作备份。本文将从源码分析整个实现过程。先看个截图：

博客导出工具(C++实现,支持sina,****,自定义列表)

操作步骤：

先在博客地址文本框输入博客地址例如：http://blog.****.net/yxstars/，http://blog.sina.com.cn/yxstars/，

http://www.cnblogs.com/yxstars/
然后点击确定，将显示共有多少篇博文，例如：[19:32:47]博文113篇
点击文章列表：将显示所有博文，格式：title，href
点击导出博文：将导出博文，在当前目录下的blog文件夹中。博文格式为html。
遍历博文：将遍历所有博文并且显示出来。
刷新：刷新所有博文，不显示。
图片：导出的博文，图片下载到本地，博文图片链接到本地。
列表：支持自定义的列表链接博文（当前目录下有个list.ini,可以自定义链接）。
刷新次数：自定义，循环次数。
时间间隔：每次循环sleep时间。

源码分析：

1. 获取对应的url页面源代码，实现如下：

/**
 * Downloads the resource at strUrl and appends its text to UrlData.
 *
 * Each line read from the server is appended followed by "\n". UrlData is
 * NOT cleared first, so callers that want only this page's content must
 * empty it before calling.
 *
 * @param strUrl   URL to fetch.
 * @param UrlData  Receives the downloaded text (appended). After a failed
 *                 download it may hold the lines read before the error.
 * @return true when the whole resource was read; false when the URL could
 *         not be opened or a network error occurred (a message is shown
 *         through ShowMes in both cases).
 */
bool CBlogExportDlg::GetUrlStr(CString strUrl, CString& UrlData)
{
	CInternetSession session;
	CHttpFile *file = NULL;
	bool netError = false;

	try{
		file = (CHttpFile*)session.OpenURL(strUrl);
		if (file != NULL){
			// Reading stays inside the try block: ReadString can raise
			// CInternetException when the connection drops mid-download.
			CString line;
			while (file->ReadString(line)){
				UrlData += line + "\n";
			}
		}
	}
	catch (CInternetException *pException){
		pException->Delete();
		netError = true;
	}

	// Release the file before the session that created it, on every path.
	const bool opened = (file != NULL);
	if (opened){
		file->Close();
		delete file;
		file = NULL;
	}
	session.Close();

	if (netError){
		ShowMes("网络连接错误...");
		return false;
	}
	if (!opened){
		ShowMes(strUrl + "获取失败...");
		return false;
	}
	return true;
}

2. 获取的html源码为utf8格式，需要转为ansi格式，C++实现代码如下：

/**
 * Converts UTF-8 text to the system ANSI code page (CP_ACP).
 *
 * The conversion goes UTF-8 -> UTF-16 -> ANSI using MultiByteToWideChar and
 * WideCharToMultiByte. This function assumes a non-Unicode (MBCS) build in
 * which CString holds narrow characters.
 *
 * @param strSource  UTF-8 encoded input. Leading and trailing whitespace is
 *                   trimmed in place, so the caller's string is modified.
 * @param strChAnsi  Receives the converted text. It is always cleared first,
 *                   so it is empty when the input is empty or conversion fails.
 * @return 0 on success (including empty input), 1 when a conversion step
 *         fails or produces an unexpected length.
 */
int CBlogExportDlg::ConvUtf8ToAnsi(CString& strSource, CString& strChAnsi)
{
	// Clear the output up front so no path can leave stale text behind.
	strChAnsi.Empty();
	if (strSource.GetLength() <= 0)
		return 0;

	strSource.TrimLeft();
	strSource.TrimRight();
	const int srcLen = strSource.GetLength();
	if (srcLen == 0)
		return 0; // whitespace-only input: nothing left to convert

	LPCSTR pszUtf8 = strSource;

	// Step 1: UTF-8 -> UTF-16. The first call only measures the required size.
	const int wideNeed = MultiByteToWideChar(CP_UTF8, 0, pszUtf8, srcLen, NULL, 0);
	if (wideNeed <= 0)
		return 1; // the API reports failure by returning 0

	CStringW wide;
	const int wideDone = MultiByteToWideChar(CP_UTF8, 0, pszUtf8, srcLen,
		wide.GetBuffer(wideNeed), wideNeed);
	wide.ReleaseBuffer(wideDone > 0 ? wideDone : 0);
	if (wideDone <= 0)
		return 1;

	// Step 2: UTF-16 -> ANSI, again measuring first.
	LPCWSTR pszWide = wide;
	const int ansiNeed = WideCharToMultiByte(CP_ACP, 0, pszWide, wideDone,
		NULL, 0, NULL, NULL);
	if (ansiNeed <= 0)
		return 1;

	const int ansiDone = WideCharToMultiByte(CP_ACP, 0, pszWide, wideDone,
		strChAnsi.GetBuffer(ansiNeed), ansiNeed, NULL, NULL);
	strChAnsi.ReleaseBuffer(ansiDone > 0 ? ansiDone : 0);

	if (wideNeed != wideDone || ansiNeed != ansiDone)
		return 1;
	return 0;
}

3. 消息文本框显示

/**
 * Appends one timestamped line to the message edit box.
 *
 * The line has the form "[HH:MM:SS]" + mes + "\r\n" and is inserted at the
 * end of the existing text of MesEdit.
 *
 * @param mes  Message text to display.
 */
void CBlogExportDlg::ShowMes(CString mes)
{
	// Build the complete line first, then insert it in a single call.
	const CTime now = CTime::GetCurrentTime();
	CString line = _T("[") + now.Format("%H:%M:%S") + "]";
	line += mes;
	line += _T("\r\n");

	// Collapse the selection to the end of the text so ReplaceSel appends.
	const int endPos = MesEdit.GetWindowTextLength();
	MesEdit.SetSel(endPos, endPos);
	MesEdit.ReplaceSel(line);
}

4. 点击确定按钮后，实现代码

/**
 * Handler for the OK button: validates the blog address, downloads the blog
 * home page and extracts the article and page counts from it.
 *
 * The raw (UTF-8) page is also dumped to the file "temp" under strDirPath
 * before it is converted to ANSI and passed to GetBlogInfo.
 */
void CBlogExportDlg::OnBnClickedButtonOk()
{
	GetDlgItemText(IDC_EDIT_ADDRESS, blogAdr);
	ShowBlogAdr();

	//blogAdr = ("http://blog.****.net/yxstars/");
	if (blogAdr.Find("http://blog.****.net/") == -1){
		ShowMes("**** blog地址不对...");
		// Stop here: the rest of the handler only understands this site's
		// pages, so fetching any other address cannot succeed.
		return;
	}
	blogAdrs = blogAdr;

	CString urlData;
	if (!GetUrlStr(blogAdr, urlData)){
		return;
	}

	CFile fs;
	if (!fs.Open(strDirPath + "temp", CFile::modeCreate | CFile::modeWrite)){
		// Report the reason instead of silently doing nothing.
		ShowMes("临时文件创建失败...");
		return;
	}
	fs.Write(urlData, urlData.GetLength());
	fs.Close();

	CString ansiUrlData;
	ConvUtf8ToAnsi(urlData, ansiUrlData);
	GetBlogInfo(ansiUrlData);
}

5. 根据博客地址，获取源代码后分析，查找博文数目，和博文列表页数。

<!--显示分页 -->

<div id="papelist" class="pagelist">

<span> 113条数据  共6页</span><strong>1</strong> <a href="/yxstars/article/list/2">2</a> <a href="/yxstars/article/list/3">3</a> <a href="/yxstars/article/list/4">4</a> <a href="/yxstars/article/list/5">5</a> <a href="/yxstars/article/list/6">...</a> <a href="/yxstars/article/list/2">下一页</a> <a href="/yxstars/article/list/6">尾页</a>

</div>

从上面的代码中可以获取信息如下：

113条数据共6页, 共有113篇博文，共有6页。

<a href="/yxstars/article/list/3">，页面链接地址为/yxstars/article/list/
+ 要显示的页数。

C++代码实现如下：

/**
 * Extracts the article count and the number of list pages from the blog
 * home page and stores the page count in blogListPage.
 *
 * The expected markup is
 *   <div id="papelist" class="pagelist"> <span> 113条数据  共6页</span> ...
 * All positions are found by searching for the markers rather than by fixed
 * character offsets, so extra whitespace between the pieces is tolerated.
 *
 * @param urlData  Page source already converted to ANSI. It is modified:
 *                 on success it is cut down to the text that follows the
 *                 "<span>" of the pager.
 */
void CBlogExportDlg::GetBlogInfo(CString& urlData)
{
	const CString pagerTag("<div id=\"papelist\" class=\"pagelist\">");
	const CString spanTag("<span>");
	const CString countSuffix("条数据");
	const CString totalPrefix("共");
	const CString pageSuffix("页</span>");

	// Locate the pager block, then the <span> that carries the totals.
	int pos = urlData.Find(pagerTag);
	if (pos != -1){
		pos = urlData.Find(spanTag, pos + pagerTag.GetLength());
	}
	if (pos == -1){
		ShowMes("获取列表数目失败...");
		return;
	}
	urlData = urlData.Mid(pos + spanTag.GetLength());
	urlData.TrimLeft();

	// Article count: everything in front of "条数据".
	pos = urlData.Find(countSuffix);
	if (pos == -1){
		ShowMes("获取列表条数失败...");
		return;
	}
	CString blogListNum = urlData.Left(pos);
	blogListNum.Trim();

	// Page count: the text between "共" and "页</span>".
	int pageStart = urlData.Find(totalPrefix, pos + countSuffix.GetLength());
	int pageEnd = -1;
	if (pageStart != -1){
		pageStart += totalPrefix.GetLength();
		pageEnd = urlData.Find(pageSuffix, pageStart);
	}
	if ((pageStart == -1) || (pageEnd == -1)){
		ShowMes("获取列表页数失败...");
		return;
	}
	CString listPage = urlData.Mid(pageStart, pageEnd - pageStart);
	listPage.Trim();
	blogListPage = StrToInt(listPage);

	ShowMes("博文" + blogListNum + "篇");
}

6. 当点击显示列表时，根据之前的页面地址获取信息。

void CBlogExportDlg::OnBnClickedButtonList()

{

	clearMes();

	CString urlData, ansiUrlData, listPage;

	//http://blog.****.net/yxstars/article/list/1

	FileListMap.clear();

	listNum = 1;

	for (int i = 1; i < blogListPage + 1; i++){

		urlData.Empty();

		ansiUrlData.Empty();

		listPage.Format("%d", i);

		blogAdr = blogAdrs + "/article/list/" + listPage;

		ShowBlogAdr();

		if (!GetUrlStr(blogAdr, urlData)){

			return;

		}

		ConvUtf8ToAnsi(urlData, ansiUrlData);

		GetFileList(ansiUrlData);

	}

}

7. 在每个页面获取文章列表和页面地址。

    <h1>

        <span class="link_title"><a href="/yxstars/article/details/38469431">

        <font color="red">[置顶]</font>

        金融系列12《双币电子现金方案》

        </a></span>

    </h1>

从上面源码可以看出：

`<a href="` 后面就是博文链接地址。

</a>前面的就是博文标题。

如果有置顶操作，标题前会多出这部分：`<font color="red">[置顶]</font>`

C++获取源码实现如下：

/**
 * Parses one article-list page and records every article it contains.
 *
 * Each entry is expected to look like
 *   <span class="link_title"><a href="/user/article/details/N"> title </a></span>
 * optionally with a <font>...</font> marker (pinned articles) in front of
 * the title. For every entry the title and absolute URL are stored in
 * FileListMap (keyed by title), listNum is incremented, and a numbered line
 * is shown through ShowMes.
 *
 * Tag positions are found by searching for the markers, not by fixed
 * character offsets, so whitespace or extra attributes do not break parsing.
 *
 * @param urlData  Page source already converted to ANSI. It is consumed:
 *                 the text in front of each parsed entry is cut off.
 */
void CBlogExportDlg::GetFileList(CString& urlData)
{
	const CString titleTag("<span class=\"link_title\">");
	const CString hrefAttr("href=\"");
	const CString anchorEnd("</a>");

	CString strListNum;
	int posF = urlData.Find(titleTag);
	while (posF != -1){
		// Jump to the first character of the link target.
		const int hrefPos = urlData.Find(hrefAttr, posF + titleTag.GetLength());
		if (hrefPos == -1){
			ShowMes("获取列表失败...");
			return;
		}
		urlData = urlData.Mid(hrefPos + hrefAttr.GetLength());

		// The link target runs up to the closing quote.
		const int hrefEnd = urlData.Find("\"");
		if (hrefEnd == -1){
			ShowMes("获取列表失败...");
			return;
		}
		CString href = urlData.Left(hrefEnd);

		// The title sits between the end of the opening <a ...> tag and </a>.
		const int tagClose = urlData.Find(">", hrefEnd);
		const int textEnd = (tagClose == -1) ? -1 : urlData.Find(anchorEnd, tagClose);
		if (textEnd == -1){
			ShowMes("获取列表失败...");
			return;
		}
		CString title = urlData.Mid(tagClose + 1, textEnd - tagClose - 1);

		// Strip nested markup (e.g. the pinned marker): keep only the text
		// after the last '>'.
		const int lastTag = title.ReverseFind('>');
		if (lastTag != -1){
			title = title.Mid(lastTag + 1);
		}
		title.Trim("\n").Trim();

		href = "http://blog.****.net" + href;
		FileListMap[title] = href;

		// Pad "NNN:title" to a fixed 45-character column in front of the URL.
		strListNum.Format("%03d", listNum++);
		strListNum = (strListNum + ":" + title + "                                            ").Left(45);
		ShowMes(strListNum + href);

		posF = urlData.Find(titleTag);
	}
}

8. 当点击导出博文时，我们只需把源代码保存为html格式即可，采用多线程实现：

void CBlogExportDlg::OnBnClickedButtonExport()

{

	clearMes();

	unsigned tid;

	unsigned long thd = _beginthreadex(NULL, 0, CBlogExportDlg::WriteCycle, this, 0, &tid);

	if (thd != NULL)

	{

		CloseHandle((HANDLE)thd);

	}

}

/**
 * Thread entry point that downloads every article in FileListMap and saves
 * it as "<title>.html" in the "Blog\" folder under strDirPath.
 *
 * The folder is created when missing. The page is written exactly as
 * downloaded (no encoding conversion). The loop ends early when stopRun is
 * set, a download fails, or a file cannot be created.
 *
 * @param p  Pointer to the owning CBlogExportDlg.
 * @return 0 when all articles were exported; 1 when the folder could not be
 *         created, a download failed or the run was stopped; 2 when an
 *         output file could not be created.
 */
unsigned __stdcall  CBlogExportDlg::WriteCycle(void* p)
{
	CBlogExportDlg* dlg = (CBlogExportDlg*)p;

	CString blogFolderPath = dlg->strDirPath + "Blog\\";
	if (!PathIsDirectory(blogFolderPath) && !CreateDirectory(blogFolderPath, NULL))
	{
		dlg->ShowMes(blogFolderPath + "创建失败...");
		return 1;
	}

	dlg->stopRun = false;

	// Characters that Windows does not allow in a file name. A title that
	// contains any of them would otherwise make cf.Open fail and abort the
	// whole export.
	static const char kIllegalNameChars[] = "\\/:*?\"<>|";

	CString urlData, strList;
	int iList = 1;
	CFile cf;
	std::map<CString, CString>::iterator iter;
	for (iter = dlg->FileListMap.begin(); iter != dlg->FileListMap.end(); ++iter){
		//dlg->blogAdr = iter->second;
		//dlg->ShowBlogAdr();
		urlData.Empty();
		if (!dlg->GetUrlStr(iter->second, urlData)){
			return 1;
		}

		strList.Format("%3d", iList++);
		dlg->ShowMes("正在导出第" + strList + "篇博文：" + iter->first);

		// Build a valid file name from the article title.
		CString blogPath(iter->first);
		for (const char* ch = kIllegalNameChars; *ch != '\0'; ++ch){
			blogPath.Replace(*ch, '_');
		}
		blogPath = blogFolderPath + blogPath + ".html";

		if (!cf.Open(blogPath, CFile::modeCreate | CFile::modeWrite)){
			dlg->ShowMes("创建文件失败" + blogPath);
			return 2;
		}
		cf.Write(urlData, urlData.GetLength());
		cf.Close();

		if (dlg->stopRun){
			return 1;
		}
	}
	return 0;
}

9. 遍历博文时，只需依次访问之前保存的链接即可，实现如下：

void CBlogExportDlg::OnBnClickedButtonRead()

{

	clearMes();

	unsigned tid;

	unsigned long thd = _beginthreadex(NULL, 0, CBlogExportDlg::ReadCycle, this, 0, &tid);

	if (thd != NULL)

	{

		CloseHandle((HANDLE)thd);

	}

}

/**
 * Thread entry point that steps through every article in FileListMap.
 *
 * For each entry it stores the article URL in blogAdr, calls ShowBlogAdr,
 * shows a progress message and then sleeps for three seconds. The loop ends
 * early when stopRun is set.
 *
 * @param p  Pointer to the owning CBlogExportDlg.
 * @return 0 after visiting every entry, 1 when stopped through stopRun.
 */
unsigned __stdcall  CBlogExportDlg::ReadCycle(void* p)
{
	CBlogExportDlg* const self = static_cast<CBlogExportDlg*>(p);
	self->stopRun = false;

	typedef std::map<CString, CString>::iterator ArticleIter;
	ArticleIter cur = self->FileListMap.begin();
	while (cur != self->FileListMap.end()){
		// first = article title, second = article URL
		self->blogAdr = cur->second;
		self->ShowBlogAdr();
		self->ShowMes("正在遍历博文：" + cur->first);

		Sleep(3000);
		if (self->stopRun){
			return 1;
		}
		cur++;
	}
	return 0;
}

****免积分下载地址：

2014.08.01更新： http://download.****.net/detail/yxstars/7786309

2014.09.05更新：http://download.****.net/detail/yxstars/7867583

秒客网