Requests + Regular Expressions: Scraping Maoyan Movies

Posted: 2023-03-09 16:59:10

The script below fetches the Maoyan Top 100 board with Requests, parses each page with a single regular expression, and appends every movie (rank, poster URL, title, cast, release date, score) to maoyan_movies.txt as one JSON object per line.

Code:

import re
import json
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException

basic_url = 'http://maoyan.com/board/4?offset=%d'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}
# Output file for the scraped results, opened in append mode.
file = open("maoyan_movies.txt", 'a', encoding="utf-8")


def get_page(url):
    """Fetch one board page and return its HTML, or None on failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == requests.codes.ok:
            return response.text
        else:
            return None
    except RequestException:
        return None


def parse_page(content):
    """Extract every movie entry on a page with one regex and yield each as a dict."""
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>'
        r'.*?<img data-src="(.*?)"'
        r'.*?class="name"><a.*?>(.*?)</a>'
        r'.*?class="star">(.*?)</p>'
        r'.*?class="releasetime">(.*?)</p>'
        r'.*?class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i>'
        r'.*?</dd>', re.S)
    items = pattern.findall(content)
    for item in items:
        yield {
            'id': item[0],
            'image': item[1],
            'name': item[2].strip(),
            'actor': item[3].strip()[3:],   # drop the "主演:" prefix
            'releasetime': item[4][5:],     # drop the "上映时间:" prefix
            'score': item[5] + item[6],     # integer part + fractional part
        }


def save_to_file(content):
    """Append one movie record to the file as a JSON line."""
    json.dump(content, file, ensure_ascii=False)
    file.write('\n')


def get_page_movies(offset):
    """
    Fetch the movie info for one page.
    `offset` is used to build the full page URL, in steps of 10.
    """
    step = 10
    url = basic_url % (step * offset)
    html = get_page(url)
    if html is None:
        return
    for movie_info in parse_page(html):
        save_to_file(movie_info)


# Fetch the Maoyan Top 100 movie info: rank, poster URL, title, cast, release date, score.
def get_top_100_movies():
    offset_list = list(range(10))
    pool = Pool(processes=4)
    pool.map(get_page_movies, offset_list)
    pool.close()
    pool.join()


if __name__ == "__main__":
    get_top_100_movies()
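
Since the results are stored one JSON object per line, a minimal sketch for loading them back for a quick check could look like this (assuming the maoyan_movies.txt produced by the script above; the variable names here are just for illustration):

import json

with open("maoyan_movies.txt", encoding="utf-8") as f:
    movies = [json.loads(line) for line in f if line.strip()]

print(len(movies))   # 100 if every page was fetched successfully
print(movies[0])     # one of the scraped records (order depends on worker scheduling)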