How to back up your blogs on cnblogs

Date: 2023-03-08 21:08:30

This is an alternative to OfflineExplorer.

Thanks to the article [1] listed in the Reference section; I modified a few lines of its script to adapt it to my blog. Here is the change list (line numbers refer to the original script in [1]):

1. L193: change "homepage1_BottomPager" to "homepage1_HomePageDays_BottomPager", because "homepage1_BottomPager" does not appear anywhere in the HTML source of my cnblogs pages.

2. L394: set url to your last page.

3. L396: set output to a directory on your local disk.
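
For quick reference, those three customization points appear in the adapted script below as follows (the url and output values are just the ones I used for my own blog):

# 1. the pager div id used to locate the link to the last listing page
pageList = document.findall('.//{0}div[@id=\'homepage1_HomePageDays_BottomPager\']'.format(namespace))

# 2. a listing page of your blog that contains a link to the last page
url = "http://www.cnblogs.com/yaoyansi/default.html?page=4"

# 3. the local output directory for the backup
output = "/tmp/my_tmp/cnblogs"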

Enjoy it!

#! encoding=utf-8

# cnblogs blog backup. Usage: edit the url and output values at the bottom of the file, then run it.

import urllib2
import re
import os
import sys
# from HTMLParser import HTMLParser
import html5lib
# from xml.etree.ElementTree import ElementTree
from urlparse import urlparse
import xml
import codecs
import traceback
import time

# class MyHTMLParser(HTMLParser):
#     def handle_starttag(self, tag, attrs):
#         # if tag.lower() == "img":
#         print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
#         for x in attrs:
#             print "name %s,value %s" % (x[0],x[1])
#     def handle_endtag(self, tag):
#         print "Encountered the end of a %s tag" % tag
#     def handle_startendtag(self, tag, attrs):
#         print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
#         for x in attrs:
#             print "name %s,value %s" % (x[0],x[1])

# number of retry attempts per resource
gTestTime = 5

def DownloadFile(url, output):
    responseText = None
    dirssPath = None
    try:
        res = urlparse(url)
        url = res.scheme + "://" + res.netloc + res.path
        path = res.path
        index = path.rfind('/')
        dirss = "/"
        if index != -1:
            dirss = output + "/" + res.netloc.encode("utf-8") + path[0:index].encode("utf-8")
        dirssPath = output + "/" + res.netloc.encode("utf-8") + path.encode("utf-8")
        dirss_ansi = dirss.decode('utf-8')
        if not os.path.exists(dirss_ansi):
            os.makedirs(dirss_ansi)
        global gTestTime
        count = gTestTime
        while True:
            if count < 0:
                break
            count = count - 1
            header = {"User-Agent": "Mozilla-Firefox5.0"}
            if not url.startswith("http://"):
                break
            try:
                # print "url: %s:%d" % (url,count)
                time.sleep(0.5)
                request = urllib2.Request(url, None, header)
                response = urllib2.urlopen(request)
                dirssPath_ansi = dirssPath.decode("utf-8")
                if not os.path.exists(dirssPath_ansi):
                    resourceFile = open(dirssPath_ansi, "wb")
                    responseText = response.read()
                    if url.endswith(".js"):
                        responseText = responseText.replace("http://", "")
                        responseText = responseText.replace("https://", "")
                    resourceFile.write(responseText)
                    resourceFile.close()
                break
            except Exception, e:
                print "DownloadFile: %s:%s:%d" % (e, url, count)
                # exstr = traceback.format_exc()
                # print exstr
    except Exception, e:
        pass
        # exstr = traceback.format_exc()
        # print exstr
    return (responseText, url, output)

def ReadCss(css):
# print "ReadCss"
mode = 'url\(\"?([^)]+)\"?\)'
pattern = re.compile(mode)
try:
text = css[0]
if css[0] == None:
return
strMatch = pattern.findall(text)
size = len(strMatch)
# print "size: ",size
for i in range(0,size,1):
one = strMatch[i]
newurl = GetConcatUrl(css[1],one)
DownloadFile(newurl,css[2])
except Exception,e:
pass
# exstr = traceback.format_exc()
# print exstr def Download(url,output):
    # try:
    header = {"User-Agent": "Mozilla-Firefox5.0"}
    namespace = "{http://www.w3.org/1999/xhtml}"
    request = urllib2.Request(url, None, header)
    response = urllib2.urlopen(request)
    data = response.read()
    document = html5lib.parse(data)
    imgElements = document.findall('.//{0}img'.format(namespace))
    # print "imgElements %d" % len(imgElements)
    for img in imgElements:
        src = img.attrib["src"]
        # print "src %s" % src
        try:
            res = urlparse(src)
            # skip images that are not hosted on cnblogs
            if not res.netloc.endswith(".cnblogs.com"):
                print "image not download: %s:%s" % (src, res.netloc)
                continue
        except Exception, e:
            pass
        DownloadFile(src, output)
    linkElements = document.findall('.//{0}link'.format(namespace))
    # print "linkElements %d" % len(linkElements)
    for link in linkElements:
        href = link.attrib["href"]
        # print "href %s" % href
        text = DownloadFile(href, output)
        if link.attrib.has_key("rel") and link.attrib["rel"].lower() == "stylesheet":
            ReadCss(text)
    scriptElements = document.findall('.//{0}script'.format(namespace))
    # print "scriptElements %d" % len(scriptElements)
    for script in scriptElements:
        if script.attrib.has_key("src"):
            src = script.attrib["src"]
            # print "src %s" % src
            DownloadFile(src, output)
    htmlNameIndex = url.rfind("/")
    urlLen = len(url)
    htmlName = GetHtmlName(url)
    output = output.decode("utf-8") + "/" + htmlName + ".htm"
    data = data.replace("http://", "")
    data = data.replace("https://", "")
    data = data.replace("www.w3.org/1999/xhtml", "http://www.w3.org/1999/xhtml")
    resourceFile = open(output, "wb")
    resourceFile.write(data)
    resourceFile.close()

def GetConcatUrl(url, png):
# one: "../images/f_icon.png" -- url http://static.****.net/public/common/toolbar/css/index.css
count = 0
index = png.find("..")
startindex = None
while index != -1:
count = count + 1;
startindex = index + 2
index = png.find("..",startindex) second = png[startindex:]
length = len(url)
index = url.rfind("/")
endindex = 0
while count >= 0 and index != -1:
endindex = index
index = url.rfind("/",0, endindex)
count = count - 1
first = url[0:endindex]
return first+second def getAllListUrl(url):
header={"User-Agent": "Mozilla-Firefox5.0"}
request = urllib2.Request(url,None,header)
response = urllib2.urlopen(request)
data = response.read() # By default, the document will be an xml.etree element instance.Whenever possible, html5lib chooses the accelerated ElementTreeimplementation (i.e. xml.etree.cElementTree on Python 2.x).
document = html5lib.parse(data)
namespace = "{http://www.w3.org/1999/xhtml}" # get <div id="homepage1_BottomPager" class="topicListFooter">
pageList = document.findall('.//{0}div[@id=\'homepage1_HomePageDays_BottomPager\']'.format(namespace))
print( "Debug>len(pageList)=%d"%len(pageList) );
# get <div class="pager">
alinks = list(pageList[0])
# get content in <div class="pager">, like:<a href="http://www.cnblogs.com/GnagWang/default.html?page=1">
alinks1 = list(alinks[0])
lastArticle = alinks1[len(alinks1)-1] # lastArticleHref = u'http://www.cnblogs.com/GnagWang/default.html?page=20'
lastArticleHref = lastArticle.attrib["href"]
lastPageIndex = lastArticleHref.rfind("=")
lastPageNum = int(lastArticleHref[lastPageIndex+1:])
urlInfo = lastArticleHref[0:lastPageIndex] urlList = []
for x in xrange(1,lastPageNum+1):
listUrl = urlInfo+"="+str(x)
urlList.append(listUrl) return urlList def getArticleList(url):
    # collect the URLs of all articles
    # <div id="article_toplist" class="list"></div>
    # <div id="article_list" class="list">
    #   <div class="list_item article_item">
    #     <div class="article_title">
    #       <span class="ico ico_type_Original"></span>
    #       <h1>
    #         <span class="link_title">
    #           <a href="/infoworld/article/details/18984183">
    #     <div class="article_manage">
    #       <span class="link_postdate"></span>
    urlList = getAllListUrl(url)
    print "文章页数(number of pages) ", len(urlList)
    header = {"User-Agent": "Mozilla-Firefox5.0"}
    allLists = []
    strPage = "分析 第 {0} 页 ".decode("utf-8").encode("utf-8")  # "parsing page {0}"
    pageNum = 0
    global gTestTime
    for one in urlList:
        tryCount = gTestTime  # try count
        pageNum = pageNum + 1
        pageNumStr = strPage.format(pageNum)
        print pageNumStr
        while tryCount > 0:
            try:
                tryCount = tryCount - 1
                time.sleep(0.5)  # requesting too fast makes the server stop responding
                request = urllib2.Request(one, None, header)
                response = urllib2.urlopen(request)
                data = response.read()
                document = html5lib.parse(data, encoding="utf-8")
                namespace = "{http://www.w3.org/1999/xhtml}"
                # .//{0}div[@id=\'article_toplist\']
                #topLists = document.findall('.//{0}div[@id=\'article_toplist\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
                #articleLists = document.findall('.//{0}div[@id=\'article_list\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
                articleLists = document.findall('.//{0}div[@class=\'postTitle\']'.format(namespace))
                allLists = allLists + articleLists
                break
            except Exception, e:
                print "getArticleList %s:%s:%d" % (e, one, tryCount)
    count = 0  # number of articles
    artices = []
    for article in allLists:
        count = count + 1
        alink = article.find(".//{0}a".format(namespace))
        # href = u'http://www.cnblogs.com/GnagWang/archive/2010/04/02/1702721.html'
        href = alink.attrib["href"]
        #oneHref = "http://blog.****.net"+href
        oneHref = href
        childElement = list(alink)
        linkIter = alink.itertext()
        title = "".encode("utf-8")
        for x in linkIter:
            title = title + x.strip().encode("utf-8")
        artices.append([oneHref, title])
    return artices

def GetUserName(url):
    htmlNameIndex = url.rfind("/")
    urlLen = len(url)
    htmlName = ""
    htmlNameIndex1 = url.rfind("/", 0, htmlNameIndex)
    htmlName = url[htmlNameIndex1+1:htmlNameIndex]
    # if htmlNameIndex+1 == urlLen:
    #     htmlNameIndex = url.rfind("/",0,htmlNameIndex)
    #     htmlName = url[htmlNameIndex+1:urlLen-1]
    # else:
    #     htmlName = url[htmlNameIndex+1:]
    return htmlName

def GetHtmlName(url):
    htmlNameIndex = url.rfind("/")
    urlLen = len(url)
    htmlName = ""
    if htmlNameIndex+1 == urlLen:
        htmlNameIndex = url.rfind("/", 0, htmlNameIndex)
        htmlName = url[htmlNameIndex+1:urlLen-1]
    else:
        htmlName = url[htmlNameIndex+1:]
    return htmlName

# url must look like http://www.cnblogs.com/GnagWang/default.html?page=19, and that page must
# contain a link to the last page. For example, if GnagWang's blog has 20 pages, a URL near the
# end (such as page=19) is recommended.
def Start(url, output):
    print "备份开始"  # backup started
    lists = getArticleList(url)
    username = GetUserName(url)
    output_username = output + "/" + username
    output_username = output_username.replace("\\", "/")  # normalize Windows-style separators
    if not os.path.exists(output_username.decode("utf-8")):
        os.mkdir(output_username.decode("utf-8"))
    totalNum = len(lists)
    print "总文章数(number of articles): %d" % totalNum
    # generate the index page
    doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
    charset = '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'
    indexHtml = output_username + ".htm"
    f = open(indexHtml.decode("utf-8"), "w")
    print >> f, doctype
    print >> f, '<html>'
    print >> f, '<head>'
    print >> f, charset
    print >> f, '</head>'
    print >> f, '<frameset cols=\"20%,*\">'
    navigationHtmlName = username + '-navigation.htm'
    print >> f, '<frame src=\"' + navigationHtmlName + '\" />'
    firstHtmlName = GetHtmlName(lists[0][0])
    print >> f, '<frame src=\"' + username + '/' + firstHtmlName + '.htm\" name=\"showframe\">'
    print >> f, '</frameset>'
    print >> f, '</html>'
    f.close()
    # generate the navigation page
    navigationHtml = output + "/" + navigationHtmlName
    # f = open(navigationHtml.decode("utf-8"),"w")
    f = codecs.open(navigationHtml.decode("utf-8"), "w", "utf-8-sig")
    print >> f, doctype
    print >> f, '<html>'
    print >> f, '<head>'
    print >> f, charset
    print >> f, '<style> body{font: 12px Verdana, Arial, Helvetica, sans-serif;}a{color: #808080;}</style>'
    print >> f, '</head>'
    print >> f, '<body>'
    count = 0
    for x in lists:
        count = count + 1
        articleIdHtml = username + "/" + GetHtmlName(x[0]) + ".htm"
        print >> f, '<a href=\"' + articleIdHtml + '\" target=\"showframe\">' + str(count) + '.' + x[1].decode("utf-8") + '</a><br /><br />'
    print >> f, '</body>'
    print >> f, '</html>'
    f.close()
    print "开始下载文章"  # start downloading articles
    currentNum = 0
    strPage = "{0}:{1}.".decode("utf-8").encode("utf-8")
    global gTestTime
    for x in lists:
        count = gTestTime
        currentNum = currentNum + 1
        while True:
            if count < 0:
                break
            count = count - 1
            try:
                time.sleep(1)  # requesting too fast triggers 503 errors
                strPageTemp = strPage.format(totalNum, currentNum)
                strPageTemp = strPageTemp + x[1]
                print strPageTemp  # this occasionally fails with an "output is not utf-8" error; print x[0] separately in that case
                print "\n"
                Download(x[0], output_username)
                break
            except Exception, e:
                # exstr = traceback.format_exc()
                # print exstr
                pass

# url must look like http://www.cnblogs.com/GnagWang/default.html?page=21, and that page must
# contain a link to the last page. For example, if GnagWang's blog has 20 pages, a URL near the end is recommended.
if __name__ == '__main__':
    url = "http://www.cnblogs.com/yaoyansi/default.html?page=4"
    # output = "C:/Users/apple/Desktop/新建文件夹"
    output = "/tmp/my_tmp/cnblogs"
    Start(url, output)
    # Download("http://blog.****.net/dcraw/article/details/6858820",
    #          "C:/Users/apple/Desktop/新建文件夹/infoworld")

Reference:

[1] http://blog.****.net/llrraa2010/article/details/35540845