使用 PyQuery + Selenium 抓取淘宝商品信息

时间:2023-03-09 01:08:52
使用 PyQuery + Selenium 抓取淘宝商品信息

配置文件(config.py):配置好数据库名称、表名称、要搜索的产品类目,以及要爬取的页数。

# MongoDB connection settings: host, database name and collection name
# used by the main program to store the scraped products.
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_TABLE = 'phone'

# Command-line switches passed to the PhantomJS driver.
SERVICE_ARGS = [
    '--disk-cache=true',    # use the disk cache when running PhantomJS
    '--load-images=false',  # do not load images when running PhantomJS
]

# Keyword typed into the search box.
KEYWORD = '手机'
# Upper bound on the number of result pages to crawl.
MAXPAGE = 5

主程序

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2018-06-14 22:02:26
# @Author : Chenjun (320316430@qq.com;)
# @Link : http://example.org
# @Version : $Id$
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
from config import *
import pymongo

# Results are stored in MongoDB; pymongo provides the client.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# PhantomJS is a headless browser, which is convenient for crawling;
# SERVICE_ARGS carries its command-line configuration.
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
browser.set_window_size(1400, 900)
# Explicit wait: give page elements up to 10 seconds to load.
wait = WebDriverWait(browser, 10)

#拿到所有的商品信息
def search(max_retries=None):
    """Search Taobao for KEYWORD and scrape the first result page.

    Loads the Taobao home page, types KEYWORD into the search box, submits
    the form, stores the products of the first result page through
    ``get_products()`` and finally reads the pager's "total" element.

    Args:
        max_retries: Maximum number of retries after a timeout. ``None``
            (the default) retries indefinitely.

    Returns:
        The text of the pager's "total" element.

    Raises:
        TimeoutException: If ``max_retries`` is given and exhausted.
    """
    attempts = 0
    # Retry in a loop rather than by recursion so that a long run of
    # timeouts cannot exhaust the interpreter's recursion limit.
    while True:
        print('正在搜索...')
        try:
            browser.get('https://www.taobao.com')
            # Wait for the search box to be present in the DOM.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            # Wait for the search button to become clickable.
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#J_TSearchForm > div.search-button > button')))
            search_box.send_keys(KEYWORD)  # simulate the user typing
            submit.click()  # simulate the user clicking
            get_products()
            # Element of the pager that reports the total number of pages.
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.total')))
            return total.text
        except TimeoutException:
            attempts += 1
            if max_retries is not None and attempts > max_retries:
                raise


def next_page(page_number, max_retries=None):
    """Jump to result page ``page_number`` and scrape its products.

    Types the page number into the pager's input box, clicks the jump
    button, waits until the pager marks that page as active and then
    stores the page's products through ``get_products()``.

    Args:
        page_number: Number of the result page to open.
        max_retries: Maximum number of retries after a timeout. ``None``
            (the default) retries indefinitely.

    Raises:
        TimeoutException: If ``max_retries`` is given and exhausted.
    """
    attempts = 0
    # Loop instead of recursing so repeated timeouts cannot overflow the
    # call stack.
    while True:
        print('正在翻页...')
        try:
            # Wait for the page-number input box to be present.
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > input'))
            )
            # Wait for the jump button to become clickable.
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
            page_box.clear()  # remove the current page number
            page_box.send_keys(str(page_number))  # type the new page number
            submit.click()
            # Wait until the pager highlights the requested page.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page_number)))
            get_products()
            return
        except TimeoutException:
            attempts += 1
            if max_retries is not None and attempts > max_retries:
                raise


# Extract the details of each product on the current page.
def get_products():
    """Parse the current result page and store every product found.

    Waits until at least one product item is present, parses the page
    source with PyQuery and passes one dict per item to
    ``save_to_mongo()`` together with its 1-based position on the page.
    """
    # Wait for the product items to be loaded.
    wait.until(EC.presence_of_element_located((
        By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    doc = pq(browser.page_source)  # parse the DOM of the current page
    items = doc('#mainsrp-itemlist .items .item').items()
    for count, item in enumerate(items, start=1):
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal text -- presumably
            # a fixed trailing suffix; verify against the live page.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        save_to_mongo(product, count)


def save_to_mongo(result, count):
    """Insert one product document into the MONGO_TABLE collection.

    Storage is best-effort: a failure is reported on stdout and does not
    propagate to the caller.

    Args:
        result: Product dict to store.
        count: Position of the product on its page, used in the log line.
    """
    try:
        # insert_one replaces the deprecated Collection.insert.
        db[MONGO_TABLE].insert_one(result)
        print(f'存储{count}到了MONGODB成功')
    except Exception as exc:
        # Keep crawling when a single document fails, but show the cause.
        print('存储失败', repr(exc))


def main():
    """Run the crawl: search, then walk result pages 2..min(total, MAXPAGE).

    Any error is reported on stdout; the browser is shut down in all cases.
    """
    try:
        total = search()
        # Take the first run of digits from the pager's "total" text.
        total = int(re.search(r'(\d+)', total).group(1))
        total = min(total, MAXPAGE)
        # Page 1 was already scraped by search().
        for i in range(2, total + 1):
            next_page(i)
    except Exception as exc:
        print('出错啦!', repr(exc))
    finally:
        # Whether the crawl succeeded or not, shut the browser down;
        # quit() also ends the driver session, close() only closes the window.
        browser.quit()


if __name__ == '__main__':
    main()