First, an introduction to using Selenium with Python 3.
Essentially, Selenium simulates a human's click events to drive a browser automatically. If you ever played Honor of Kings (王者荣耀), the January 2016 version had a bug: on an Android phone you could install a key-automation app (按键精灵) and script the buttons in adventure mode, letting the phone grind levels by itself at 19 gold coins per run, enough for a new hero in one night. The developers were no pushovers, though: they set a cap of roughly 4,000 gold per week. Feel free to try it if you are curious (note: the phone must be rooted).
Now, on to the main topic:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
Before writing the code, you need to install the selenium module.
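A minimal way to do that, assuming pip is available on your system:

pip install selenium

Note that webdriver.Chrome() below also needs a chromedriver binary matching your Chrome version somewhere on your PATH (Firefox would need geckodriver instead).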
brguge = webdriver.Chrome()  # create the browser driver object
try:
    brguge.get('https://www.baidu.com')  # request the page
    search_input = brguge.find_element_by_id('kw')  # locate the search box
    search_input.send_keys('python')  # type the keyword "python"
    search_input.send_keys(Keys.ENTER)  # press Enter
    wait = WebDriverWait(brguge, 10)  # wait up to 10s for elements to load
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))  # wait for the results container
    print(brguge.current_url)  # print the URL of the search results
    print(brguge.get_cookies())  # print the cookies
    print(brguge.page_source)  # print the page source of the results
finally:
    brguge.close()  # close the Chrome browser
Below are some basic usages of the selenium module.
Finding elements
A single element
(from selenium import webdriver)
brguge.find_element_by_id('q')                # find the element whose id is "q"
brguge.find_element_by_css_selector('#q')     # find it via a CSS selector
brguge.find_element_by_xpath('//*[@id="q"]')  # all three lines have the same effect
brguge.find_element_by_name()                 # find by the name attribute
brguge.find_element_by_link_text()            # find a link by its exact text
brguge.find_element_by_partial_link_text()    # find a link by partial text
brguge.find_element_by_tag_name()             # find by tag name
brguge.find_element_by_class_name()           # find by class
from selenium import webdriver
from selenium.webdriver.common.by import By
brguge.find_element(By.ID, 'q')               # the generic lookup form
Multiple elements (find_elements, note the extra "s")
These methods return all matches as a list:
brguge.find_elements_by_css_selector('.service-bd li')   # elements matching the CSS selector '.service-bd li'
brguge.find_elements(By.CSS_SELECTOR, '.service-bd li')  # the two calls are equivalent
(With a list index you can then pull out one element or several; see the sketch below.)
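A small sketch of indexing into the returned list; the '.service-bd li' selector is the one from above, and the Taobao home page is an assumed target that contains such items:

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')  # assumed page containing '.service-bd li' items

items = browser.find_elements(By.CSS_SELECTOR, '.service-bd li')  # a list of WebElements
print(len(items))          # how many matches were found
if items:                  # guard against an empty result list
    print(items[0].text)   # first match, via index
    print(items[-1].text)  # last match
browser.close()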
Interacting with elements (get an element, then give it commands)
A typical flow: select the input box → send_keys('some text') → clear() to empty the box → type something else → find the search button → click() it (a sketch follows).
input.clear() empties the input box.
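Put together, the flow looks roughly like this. The sketch reuses Baidu's search box from the earlier example; the button id 'su' is an assumption about Baidu's markup:

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')

box = browser.find_element_by_id('kw')     # select the input box
box.send_keys('python')                    # type some text
box.clear()                                # clear the input box
box.send_keys('selenium')                  # type something else
button = browser.find_element_by_id('su')  # find the search button ('su' assumed)
button.click()                             # click it
browser.close()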
Interaction actions (append actions to an action chain, then execute them in sequence)
switch_to.frame('iframeResult')  # switch into the frame that holds the elements
Use CSS selectors to find the two elements that should interact,
create an ActionChains object bound to the browser, then:
drag_and_drop(source, target)  # drag the first element onto the second
perform()  # execute the queued actions
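Here is a minimal runnable sketch of that chain. It targets the jQuery UI droppable demo hosted at runoob.com; the URL, the 'iframeResult' frame name, and the '#draggable'/'#droppable' selectors are assumptions about that demo page:

from selenium import webdriver
from selenium.webdriver import ActionChains

browser = webdriver.Chrome()
browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')  # assumed demo page
browser.switch_to.frame('iframeResult')  # the demo renders inside this iframe

source = browser.find_element_by_css_selector('#draggable')  # element to drag (assumed id)
target = browser.find_element_by_css_selector('#droppable')  # drop target (assumed id)

actions = ActionChains(browser)        # bind the action chain to the browser
actions.drag_and_drop(source, target)  # queue: drag source onto target
actions.perform()                      # execute the queued actions
browser.close()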
Next, let's look at example code that uses Python 3 and Selenium to crawl JD (jd.com) product listings.
The full code is as follows:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from lxml import etree
import time, json

JD_URL_LOGIN = "https://www.jd.com/"

class CustomizeException(Exception):
    def __init__(self, status, msg):
        self.status = status
        self.msg = msg

class JD:
    def __init__(self):
        self.browser = None
        self.__init_browser()

    def __init_browser(self):
        options = Options()
        options.add_argument("--headless")
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # run in no-image mode
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        self.browser = webdriver.Chrome(options=options)
        # maximize the browser window
        self.browser.maximize_window()
        # implicit wait of 3s
        self.browser.implicitly_wait(3)
        self.browser.get(JD_URL_LOGIN)
        self.wait = WebDriverWait(self.browser, 10)

    def __search_goods(self, goods):
        '''Search for a goods keyword.'''
        self.file = open("jd-{}.json".format(goods), "a", encoding="utf-8")
        self.wait.until(EC.presence_of_all_elements_located((By.ID, "key")))
        search_input = self.browser.find_element_by_id("key")
        search_input.clear()
        search_input.send_keys(goods, Keys.ENTER)

    def __get_goods_info(self, page_source):
        '''Extract the wanted fields from the page source.'''
        selector_html = etree.HTML(page_source)
        # goods name; don't rely on the title attribute, to be changed later:
        # ideally this should read the text content of the product-name node
        goods_name = selector_html.xpath("//div[@class='gl-i-wrap']//div[contains(@class,'p-name')]/a/@title")
        # goods price
        goods_price = selector_html.xpath("//div[@class='gl-i-wrap']//div[@class='p-price']/strong/i/text()")
        # number of reviews
        comment_num_selector = selector_html.xpath("//div[@class='p-commit']/strong")
        comment_num = [selector.xpath("string(.)") for selector in comment_num_selector]
        # shop name
        shop_name = selector_html.xpath("//a[@class='curr-shop']/text()")
        goods_zip = zip(goods_name, goods_price, comment_num, shop_name)
        for goods_info in goods_zip:
            dic = {}
            dic["goods_name"] = goods_info[0]
            dic["goods_price"] = goods_info[1]
            dic["comment_num"] = goods_info[2]
            dic["shop_name"] = goods_info[3]
            # print("goods name >>:", goods_info[0])
            # print("goods price >>:", goods_info[1])
            # print("number of reviews >>:", goods_info[2])
            # print("shop name >>:", goods_info[3])
            # print("*" * 100)
            yield dic

    def __swipe_page(self):
        '''Scroll the page down repeatedly, then return the full page source.'''
        height = self.browser.execute_script("return document.body.scrollHeight;")
        js = "window.scrollTo(0, {});".format(height)
        self.browser.execute_script(js)
        while True:
            time.sleep(1)
            now_height = self.browser.execute_script("return document.body.scrollHeight;")
            if height == now_height:
                return self.browser.page_source
            js = "window.scrollTo({}, {});".format(height, now_height)
            self.browser.execute_script(js)
            height = now_height

    def __is_element_exists(self, xpath):
        '''Check whether an XPath matches anything on the page.'''
        try:
            self.browser.find_element_by_xpath(xpath)
            return True
        except NoSuchElementException:
            return False

    def __click_next_page(self):
        '''Click "next page" to paginate.'''
        self.wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "pn-next")))
        xpath = "//a[@class='pn-next']"
        if not self.__is_element_exists(xpath):
            raise CustomizeException(10000, "finished with this goods keyword")
        self.browser.find_element_by_xpath(xpath).click()

    def __write_to_json(self, dic: dict):
        data_json = json.dumps(dic, ensure_ascii=False)
        self.file.write(data_json + "\n")

    def run(self, goods):
        self.__search_goods(goods)
        n = 1
        while True:
            print("Crawling goods <{}>, page {}......".format(goods, n))
            time.sleep(3)
            html = self.__swipe_page()
            for dic in self.__get_goods_info(html):
                self.__write_to_json(dic)
            try:
                self.__click_next_page()
            except CustomizeException:
                try:
                    goods = goods_list.pop(0)
                    self.run(goods)
                except IndexError:
                    return
            n += 1

    def __del__(self):
        self.browser.close()
        self.file.close()

if __name__ == '__main__':
    jd = JD()
    goods_list = ["纯牛奶", "酸奶", "奶茶", "床上用品", "电磁炉", "电视", "小米笔记本", "华硕笔记本", "联想笔记本", "男士洗面奶", "女士洗面奶", "沐浴露", "洗发露",
                  "牙刷", "牙膏", "拖鞋", "剃须刀", "水手服", "运动服", "红龙果", "苹果", "香蕉", "洗衣液", "电饭煲"]
    try:
        goods = goods_list.pop(0)
    except IndexError:
        raise CustomizeException(20000, "goods_list must not be empty")
    try:
        jd.run(goods)
    finally:
        del jd
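Two design choices in this script are worth noting. The headless and no-image Chrome options keep the crawl fast and windowless. And __swipe_page scrolls down in steps because JD's result list loads more items as you scroll: items further down only render once they come into view, so the page source is only complete once the scroll height stops growing.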
Summary
That covers the example code for crawling JD product data with Python 3 and Selenium. I hope it helps; if you have any questions, leave me a comment and I will reply promptly. Many thanks as well for your support of this site!
If this article helped you, feel free to share it; please credit the source. Original link: https://www.cnblogs.com/zhuchunyu/archive/2019/04/25/10765875.html