目标任务:使用多进程下载金庸网各个版本(旧版、修订版、新修版)的小说
代码如下:
# -*- coding: utf-8 -*-
import io
import os
import sys
from multiprocessing import Pool

import requests
from lxml import etree
reload(sys)
sys.setdefaultencoding('utf-8') headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'} def download(title,url, filename):
response = requests.get(url, headers=headers).text
html = etree.HTML(response)
pages = html.xpath('//div//p/text()')[2:]
with open(filename, 'a') as f:
f.write(title+'\n')
for page in pages:
with open(filename, 'a') as f:
f.write(page+'\n') def main(url):
start_url = 'http://www.jinyongwang.com'+url
sname = start_url.split('/')[-2]
if sname.startswith('o'):
folder = 'old/'
if(not os.path.exists(folder)):
os.makedirs(folder)
elif sname.startswith('n'):
folder = 'new/'
if(not os.path.exists(folder)):
os.makedirs(folder)
else:
folder = 'now/'
if(not os.path.exists(folder)):
os.makedirs(folder)
filename = folder+sname+'.txt'
base_url = 'http://www.jinyongwang.com'
response = requests.get(start_url, headers=headers).text
html = etree.HTML(response)
urls = html.xpath('//ul[@class="mlist"]/li/a/@href')
titles = html.xpath('//ul[@class="mlist"]/li//text()')
for index,url in enumerate(urls):
full_url = base_url+url
title = titles[index]
download(title, full_url, filename) if __name__ == '__main__':
url01 = 'http://www.jinyongwang.com/'
response = requests.get(url01, headers=headers).text
html = etree.HTML(response)
urls = html.xpath('//li[@class="book_li"]/p[3]//a/@href')
pool = Pool()
pool.map(main,urls)
pool.close()
pool.join()
结果展示: