Scraping 笔趣阁 (Biquge) novels with Selenium

Time: 2023-03-09 19:33:01
import re
from time import sleep

from lxml import etree
from selenium import webdriver

options = webdriver.ChromeOptions()
# options.add_argument('--headless')
options.add_argument(
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36')
options.add_argument('Referer=https://s.weibo.com/')  # note: not a standard Chrome command-line switch
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('blink-settings=imagesEnabled=false')  # skip loading images
options.add_argument('--disable-gpu')
options.add_argument('--hide-scrollbars')  # hide scrollbars, helps with some special pages
options.add_argument('Cookie: ')  # empty Cookie placeholder; not a standard Chrome command-line switch


class Qidian:
    def __init__(self, url, driver):
        self.url = url
        self.driver = driver
        content = self.get_content(url)
        self.file_name = self.parse_file_name(content)

    def crawl_start(self):
        content = self.get_content(self.url)
        self.parse_detail(content)

    def get_content(self, url):
        self.driver.get(url)
        content = self.driver.page_source
        return content

    def parse_file_name(self, content):
        # The novel title sits in <div id="info"><h1>; use it as the output file name.
        html = etree.HTML(content)
        file_info = html.xpath('//*[@id="info"]/h1/text()')
        file_name = file_info[0] + ".txt"
        return file_name

    def parse_detail(self, content):
        # Each chapter is a <dd> entry under <div id="list">.
        html = etree.HTML(content)
        ul = html.xpath('//div[@id="list"]/dl//dd')
        # Truncate the output file once, then append chapter by chapter.
        open(self.file_name, 'w', encoding='utf-8').close()
        for li in ul:
            item = {}
            title = li.xpath('./a/text()')
            href = li.xpath('./a/@href')
            item['title'] = title[0]
            item['href'] = "http://www.biquge.info/0_273/" + href[0]
            print(item)
            self.driver.get(item['href'])
            html = etree.HTML(self.driver.page_source)
            details = html.xpath('//*[@id="content"]//text()')
            detail = ''.join(details)
            self.save_to_file(self.file_name, title[0], detail)
            sleep(3)  # pause between chapter requests

    def save_to_file(self, file_name, title, content):
        with open(file_name, 'a+', encoding='utf-8') as f:
            f.write(title + '\n')
            f.write(content)
            f.write('\n')


if __name__ == "__main__":
    url = "http://www.biquge.info/0_273/"
    driver = webdriver.Chrome(options=options)
    try:
        qidian = Qidian(url, driver)
        qidian.crawl_start()
        driver.quit()
    except Exception as e:
        print(str(e))
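
A possible refinement, not part of the original script: instead of a fixed sleep(3) between chapter requests, Selenium's explicit waits can block until the chapter body is actually present before reading page_source. A minimal sketch, assuming the same "content" element id used above and a hypothetical 10-second timeout:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the chapter text container to appear;
# after this returns, driver.page_source contains the loaded chapter.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "content"))
)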
