Python使用Selenium模块实现模拟浏览器抓取淘宝商品美食信息功能示例

时间:2022-08-23 17:39:08

本文实例讲述了Python使用Selenium模块实现模拟浏览器抓取淘宝商品美食信息功能。分享给大家供大家参考,具体如下:

代码如下:
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
from pymongo import MongoClient
from pymongo.errors import PyMongoError
# Module-level configuration and shared objects used by every function below.
# Start page that search() loads.
url = 'http://www.taobao.com'
# Keyword typed into the search box ("美食" means gourmet food).
KEYWORD = '美食'
# MongoDB configuration
MONGO_HOST = "localhost"
MONGO_DATABASE = "taobao"
MONGO_TABLE = "meishi"
client = MongoClient(host=MONGO_HOST)
db = client[MONGO_DATABASE]
# PhantomJS command-line options
# See http://phantomjs.org/api/command-line.html
SERVICE_ARGS = ['--disk-cache=true', '--load-images=false']
# driver = webdriver.Chrome() # with a visible browser window
driver = webdriver.PhantomJS(service_args=SERVICE_ARGS) # headless (no GUI)
# Timeout handed to WebDriverWait (seconds, per the Selenium API).
delay = 10
wait = WebDriverWait(driver, delay)
# print('windows size', driver.get_window_size())
# The PhantomJS browser window is very small by default: only 400 * 300
driver.maximize_window() # maximize the window # setting the window size is critical for PhantomJS; problems are frequent if it is left unset
# driver.set_window_size(1920, 1080) # set an explicit browser window size
# Simulate typing the keyword into the Taobao search box and submitting it.
def search():
  '''
  Search for KEYWORD, save the first result page and return the page count.

  Loads ``url``, types ``KEYWORD`` into the ``input#q`` box, clicks the search
  button, reads the text of the ``div.total`` element, then parses the first
  result page and stores its items in MongoDB.

  Returns:
    int: the first run of digits found in the ``div.total`` text.

  Raises:
    ValueError: if the ``div.total`` text contains no digits.

  On a Selenium TimeoutException the whole search is retried and the result
  of the retry is returned.
  '''
  print("准备搜索 %s" % KEYWORD)
  try:
    driver.get(url)
    input_box = wait.until(
      EC.presence_of_element_located((By.CSS_SELECTOR, "input#q"))
    )
    search_btn = wait.until(EC.element_to_be_clickable(
      (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
    input_box.send_keys(KEYWORD)
    search_btn.click()
    total_page_str = wait.until(
      EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'div.total'))).text
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    match = re.search(r"(\d+)", total_page_str)
    if match is None:
      raise ValueError("no page count found in %r" % total_page_str)
    total_page_num = int(match.group(1))
    item_list = get_goods_by_beautifulsoup()
    save_to_mongodb(item_list)
    return total_page_num
  except TimeoutException:
    # WebDriverWait raises selenium's TimeoutException, which is unrelated to
    # the builtin TimeoutError, so the builtin would never match here.
    print("搜索%s超时" % KEYWORD)
    print("重新尝试搜索: %s" % KEYWORD)
    # Propagate the retry's page count; without the return the caller gets None.
    return search()
# Fetch the result page with the given number and save its items to the database.
def get_page(page_num):
  '''
  Jump to result page ``page_num`` and store the items found on it.

  Types the page number into the pager's input box, clicks the submit
  button, waits until the active pager entry shows that number, then parses
  the page and saves the items to MongoDB.

  If any wait raises TimeoutException, the function reports the failure and
  calls itself again with the same page number.
  '''
  print("正在获取第%d页数据" % page_num)
  page_input_locator = (By.CSS_SELECTOR, "div.form > input")
  submit_locator = (By.CSS_SELECTOR, 'div.form > span.btn.J_Submit')
  active_page_locator = (By.CSS_SELECTOR, 'li.item.active > span.num')
  try:
    jump_input = wait.until(EC.presence_of_element_located(page_input_locator))
    jump_button = wait.until(EC.element_to_be_clickable(submit_locator))
    jump_input.clear()
    jump_input.send_keys(page_num)
    jump_button.click()
    # The jump is complete once the highlighted pager entry shows the number.
    page_is_active = EC.text_to_be_present_in_element(
      active_page_locator, str(page_num))
    wait.until(page_is_active)
    save_to_mongodb(get_goods_by_beautifulsoup())
  except TimeoutException:
    print("请求第%d页失败" % page_num)
    print("尝试重新获取第%d页" % page_num)
    return get_page(page_num)
def get_goods_by_pyquery():
  '''
  Parse the current result page with pyquery.

  Waits until at least one item node is present, then yields one dict per
  item with the keys image, price, title, deal_cnt, shop and location.
  '''
  item_selector = "#mainsrp-itemlist .items .item"
  wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, item_selector)))
  doc = pq(driver.page_source)
  # .items() already yields one PyQuery object per match; no list() needed.
  for item in doc(item_selector).items():
    yield {
      # Use data-src, not src: src often points at a placeholder .gif
      # instead of the real product image.
      # Only the scheme is prepended, matching get_goods_by_beautifulsoup();
      # the values appear to be protocol-relative ("//host/path"), so the
      # former 'http://' prefix produced 'http:////host/path'.
      'image': 'http:' + item.find('.J_ItemPic.img').attr('data-src'),
      'price': item.find('.price').text(),
      'title': item.find('.row > .J_ClickStat').text().strip(),
      # Drop the trailing three characters (presumably a unit suffix such as
      # "人付款" - confirm against the live page).
      'deal_cnt': item.find('.deal-cnt').text()[:-3],
      'shop': item.find('.shop').text(),
      'location': item.find('.location').text(),
    }
# Parse the result page with bs4.
def get_goods_by_beautifulsoup():
  '''
  Parse the current result page with BeautifulSoup (lxml parser).

  Waits until at least one item node is present, then yields one dict per
  item with the keys image, price, title, deal_cnt, shop and location.
  A missing sub-element raises IndexError, because each field takes the
  first match of its selector.
  '''
  item_selector = '#mainsrp-itemlist .items .item'
  wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, item_selector)))
  soup = BeautifulSoup(driver.page_source, 'lxml')
  for node in soup.select(item_selector):
    def first(css):
      # First element under the current item that matches the selector.
      return node.select(css)[0]

    # Fields are read in the same order as the keys of the yielded dict.
    image_src = first('img.J_ItemPic.img')['data-src']
    price = first('div.price.g_price.g_price-highlight').get_text(strip=True)
    title = first('div.row.row-2.title > a.J_ClickStat').get_text(strip=True)
    # Drop the trailing three characters of the deal-count text.
    deal_cnt = first('div.deal-cnt').text[:-3]
    shop = first('div.shop > a').get_text(strip=True)
    location = first('div.location').text
    yield {
      'image': 'http:' + image_src,
      'price': price,
      'title': title,
      'deal_cnt': deal_cnt,
      'shop': shop,
      'location': location,
    }
def save_to_mongodb(item_list):
  '''
  Insert every item of ``item_list`` into the MONGO_TABLE collection.

  ``item_list`` is any iterable of dicts (the parser generators in this file
  qualify). Each item is inserted on its own, so one failed insert is
  reported and does not stop the remaining items.
  '''
  collection = db[MONGO_TABLE]
  for item in item_list:
    try:
      # Collection.insert() was deprecated in PyMongo 3 and removed in
      # PyMongo 4; insert_one() is its single-document replacement.
      collection.insert_one(item)
      print("mongodb插入数据成功:", item)
    except PyMongoError as e:
      print("mongodb插入数据失败:", item, e)
# Collect image URL, price, title, deal count, shop name and shop location of
# Taobao food items and store the results in MongoDB.
if __name__ == '__main__':
  try:
    # search() stores page 1 itself, so the loop starts at page 2.
    total_pages = search()
    for page_num in range(2, total_pages + 1):
      get_page(page_num)
  except Exception as e:
    print("出错了", e)
  finally:
    # Make sure the browser is shut down. quit() ends the whole WebDriver
    # session and its browser process; close() only closes the current
    # window and can leave the PhantomJS process running.
    driver.quit()

备注:

PhantomJS无界面浏览器打开的窗口默认大小为400*300,往往不能将网页加载完全,会给提取数据造成很大的困难,因此需要指定窗口大小。

可以使用 maximize_window() 最大化窗口或者set_window_size()设置指定大小

可能会出现的异常:

raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
Screenshot: available via screen

希望本文所述对大家Python程序设计有所帮助。

原文链接:http://www.cnblogs.com/hupeng1234/p/7117335.html