Python 爬虫实例(9)—— 搜索 爬取 淘宝

时间:2022-11-08 04:46:22
# coding:utf-8

import datetime
import json
import logging.handlers
import pickle
import re
import sys
import time

import redis
import requests
from bs4 import BeautifulSoup

session = requests.session()

reload(sys)
sys.setdefaultencoding('utf8')
# 生成一年的日期
def dateRange(start, end, step=1, format="%Y-%m-%d"):
    """Return the dates from *start* up to, but not including, *end*.

    Args:
        start: First date, as a string matching *format*.
        end: Exclusive end date, as a string matching *format*.
        step: Number of days between consecutive entries.
        format: ``strptime``/``strftime`` pattern used both to parse the
            inputs and to render the results.

    Returns:
        A list of date strings rendered with *format*. The list is empty
        when *end* is not later than *start*.

    Raises:
        ValueError: If *start* or *end* does not match *format*.
    """
    # NOTE(review): the default for ``step`` and the start of the range were
    # missing from the source text; 1 and 0 are restored here -- confirm.
    first_day = datetime.datetime.strptime(start, format)
    day_count = (datetime.datetime.strptime(end, format) - first_day).days
    # ``range`` (not ``xrange``) keeps this runnable on both Python 2 and 3.
    return [
        (first_day + datetime.timedelta(days=offset)).strftime(format)
        for offset in range(0, day_count, step)
    ]
def _first_match(pattern, text):
    """Return the first capture group of *pattern* in *text*, or '' if absent."""
    found = re.search(pattern, text)
    return found.group(1) if found else ''


def _parse_items(page_source):
    """Extract the listed items from the HTML of a search-result page.

    Returns a list of tuples
    ``(image_url, name, location, shop_name, price, purchase_count)``;
    a field that cannot be found is returned as ''.
    """
    # All whitespace is removed first, which is why the patterns below read
    # ``<divclass=...`` instead of ``<div class=...``.
    compact = (str(page_source)
               .replace('\n', '')
               .replace('\r', '')
               .replace('\t', '')
               .replace(' ', ''))
    blocks = re.findall('<divclass="pic-boxJ_MouseEneterLeaveJ_PicBox">(.*?)</div><divclass="ctx-boxJ_MouseEneterLeaveJ_IconMoreNew">(.*?)</div><divclass="rowrow-4g-clearfix">(.*?)</div></div></div>', compact)

    items = []
    # NOTE(review): the tuple indices were missing from the source text. The
    # picture box (group 1) is assumed to hold the image and title, and the
    # context box (group 2) the remaining fields -- confirm.
    for pic_box, ctx_box, _unused_row in blocks:
        items.append((
            _first_match('data-src="(.*?)"alt=', pic_box),
            _first_match('alt="(.*?)"/></a></div><divclass=', pic_box),
            _first_match('<divclass="location">(.*?)</div>', ctx_box),
            _first_match('</span></span><span>(.*?)</span></a></div><divclass="location">', ctx_box),
            _first_match('<divclass="priceg_priceg_price-highlight"><span>¥</span><strong>(.*?)</strong></div>', ctx_box),
            _first_match('<divclass="deal-cnt">(.*?)人付款</div>', ctx_box),
        ))
    return items


def spider():
    """Search taobao.com for 'python' in Chrome and print the listed items.

    Opens the Taobao home page through chromedriver, submits the search,
    then repeatedly clicks the "next page" link and prints, for every item
    found on the page, its image URL, name, location, shop name, price and
    purchase count followed by a separator line. The browser is closed when
    the function finishes or fails. Returns None.
    """
    from selenium import webdriver
    import os

    # NOTE(review): every numeric literal in this function was missing from
    # the source text; the values below are placeholders -- confirm them.
    wait_seconds = 5
    page_count = 10
    separator_width = 30

    # Raw string: the backslashes are kept literally without relying on
    # unrecognised escape sequences.
    chromedriver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    os.environ["webdriver.chrome.driver"] = chromedriver
    browser = webdriver.Chrome(chromedriver)
    try:
        browser.get("https://www.taobao.com/")
        time.sleep(wait_seconds)

        browser.find_element_by_id("q").send_keys(u'python')
        browser.find_element_by_class_name("btn-search").click()
        time.sleep(wait_seconds)

        # NOTE(review): the source clicks "next page" before reading the
        # page, so the first result page is never printed; that order is
        # kept here. Parsing is assumed to belong inside this loop -- the
        # original indentation was lost.
        for _ in range(page_count):
            browser.find_element_by_xpath('//a[@trace="srp_bottom_pagedown"]').click()
            time.sleep(wait_seconds)

            items = _parse_items(browser.page_source)
            print(len(items))
            for fields in items:
                for value in fields:
                    print(value)
                print("=" * separator_width)
    finally:
        # Always release the browser and the chromedriver process.
        browser.quit()


if __name__ == "__main__":
    spider()