# Tutorial: scraping images from the Qiushibaike adult section in about 30 lines of code.
import os
import re
import urllib
import urllib.request

import requests
from requests.exceptions import RequestException
# Build the address of each list page to crawl; range(1, 20) yields pages
# 1 through 19.
# NOTE(review): every iteration overwrites ``url``, so only the last page's
# address ('/19.html') is left for main() to fetch, and the path has no
# scheme/host prefix -- confirm the intended base URL and page count.
for j in range(1, 20):
    url = '/' + str(j) + '.html'
# Fetch the HTML source of a page.
def get_page_index(url):
    """Fetch *url* and return its body decoded as GBK text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page source, or None when the response status is not
        200 or the request itself fails.
    """
    try:
        # A timeout keeps a stalled server from hanging the crawl forever.
        response = requests.get(url, timeout=10)
    except RequestException:
        print('its error')
        return None
    if response.status_code != 200:
        return None
    # Decode explicitly as GBK (the encoding this script hard-codes) rather
    # than relying on charset detection; undecodable bytes are replaced so a
    # single bad byte cannot abort the crawl.
    return response.content.decode('gbk', errors='replace')
def download_img(html, save_dir="f:/qiushibaike/"):
    """Download every image referenced in *html* into *save_dir*.

    Images are saved as ``0.jpg``, ``1.jpg``, ... in the order their
    ``<img alt=... src="..." />`` tags appear in the page.

    Args:
        html: Page source to scan. ``None`` or an empty string is treated
            as "nothing to download".
        save_dir: Directory that receives the files; created if missing.

    Returns:
        None.
    """
    if not html:
        # get_page_index() returns None on failure; there is nothing to scan.
        return

    # Regex that captures the image address from each <img> tag.  DOTALL
    # lets a tag that is wrapped across several lines still match.
    pattern = re.compile('<img alt=.*? src="(.*?)".*? />', re.S)
    image_urls = pattern.findall(html)

    os.makedirs(save_dir, exist_ok=True)
    for index, image_url in enumerate(image_urls):
        print('正在下载中....')
        target = os.path.join(save_dir, str(index) + '.jpg')
        # NOTE(review): urlopen follows whatever scheme the page supplies
        # (including file://); restrict schemes if pages are untrusted.
        # The context managers close both the connection and the file even
        # when the transfer fails part-way.
        with urllib.request.urlopen(image_url) as response, \
                open(target, 'wb') as out_file:
            out_file.write(response.read())
def main():
    """Fetch the page at the module-level ``url`` and save its images."""
    html = get_page_index(url)
    if html is None:
        # get_page_index() returns None when the fetch fails; skip parsing.
        return
    download_img(html)
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()