python爬虫某东商品销售数据

时间:2022-11-19 23:00:46

经典四步曲:

1、查找有效URL,分析是静态数据还是动态数据,分析数据的格式。

2、编写python代码,向服务器发送请求,获取数据。

3、解析数据

4、保存数据

import requests
import json
import time
import openpyxl

# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
#                   "Chrome/86.0.4240.198 Safari/537.36"
# }
# url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=10054692622823&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'
# resp = requests.get(url=url, headers=headers)
# print(resp.text)

def get_comments(productID, page):
    """Fetch one page of JD comment data for a product and return it as a dict.

    Args:
        productID: JD product id (string or int), interpolated into the URL.
        page: zero-based page index of the comment listing.

    Returns:
        The parsed JSON payload (dict) with the JSONP wrapper removed.
    """
    headers = {
        # Fixed typos in the original UA ("KABUL" -> "KHTML", missing spaces);
        # a malformed User-Agent is more likely to be rejected by the server.
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/86.0.4240.198 Safari/537.36"
    }
    url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId={0}&score=0&sortType=5&page={1}&pageSize=10&isShadowSku=0&fold=1'.format(productID, page)
    resp = requests.get(url=url, headers=headers)
    # The response is JSONP: fetchJSON_comment98({...});
    # Strip only the leading callback name and the trailing ");" — the original
    # used str.replace, which removes EVERY occurrence of ");" and corrupts the
    # JSON whenever a comment body happens to contain that substring.
    text = resp.text.strip()
    prefix = 'fetchJSON_comment98('
    if text.startswith(prefix):
        text = text[len(prefix):]
    if text.endswith(');'):
        text = text[:-2]
    return json.loads(text)

def get_max_page(productID):
    """Return the total number of comment pages reported for a product."""
    first_page = get_comments(productID, 0)
    return first_page['maxPage']

def get_info(productID):
    """Scrape comment pages for a product and save them to an Excel file.

    Collects content, color, size and score for each comment, then delegates
    to save(). Sleeps 3s between pages to avoid triggering anti-scraping.
    """
    # Capped at 10 pages to stay under the anti-scraping radar; switch to
    # get_max_page(productID) to fetch everything.
    max_page = 10
    rows = []  # renamed from `list`, which shadowed the builtin
    for page in range(1, max_page + 1):
        comments = get_comments(productID, page)
        for item in comments['comments']:
            # Bug fix: the original read item['score'] but never appended it,
            # so the rating column was silently dropped from the output.
            rows.append([item['content'],
                         item['productColor'],
                         item['productSize'],
                         item['score']])
        time.sleep(3)  # throttle requests between pages
    save(rows)

def save(rows):
    """Write scraped comment rows to '商品销售数据.xlsx' in the working directory.

    Args:
        rows: iterable of row lists, one list per comment; appended verbatim
              to the active worksheet. (Parameter renamed from `list`, which
              shadowed the builtin; all call sites pass it positionally.)
    """
    wk = openpyxl.Workbook()   # create a new workbook
    sheet = wk.active          # grab the default active worksheet
    for row in rows:
        sheet.append(row)      # one spreadsheet row per comment
    wk.save('商品销售数据.xlsx')  # persist to disk

if __name__ == '__main__':
    # JD product id to scrape; comments for it end up in 商品销售数据.xlsx.
    target_product = '10025732446506'
    get_info(target_product)

备注:避免反爬,只取了前10页数据。后续篇章将分享分析数据(含饼图)