Parsing Web Page Data in Python with BeautifulSoup and XPath

Date: 2022-12-05 21:57:31

Preface: After a requests call fetches a page, the response is usually parsed in one of two ways (BeautifulSoup or XPath). Below, using a real-estate listing site as the example, the same data is extracted with each method in turn.
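
Both short snippets below assume a resp response object that has already been fetched. A minimal sketch of that step (using page 1 of the listing URL that appears in the full examples further down):

import requests

url = "https://newhouse.fang.com/house/s/b91"             # page 1, same URL pattern as the full examples below
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
resp.encoding = resp.apparent_encoding                    # let requests pick the declared encoding, as the full examples do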

1. The XPath method:

from lxml import etree

e = etree.HTML(resp.text)                                 # parse the fetched HTML into an lxml element tree
names = [n.strip() for n in e.xpath("//div[@class='nlcd_name']/a/text()")]   # project names
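
The query above returns one flat list of names for the whole page; if a listing happens to be missing a field, parallel lists built this way can drift out of alignment. A possible alternative (a sketch, assuming each listing sits in its own li under the nl_con container used in the BeautifulSoup example) is to iterate listing by listing with relative XPath:

# Sketch: per-listing extraction with relative XPath (container class taken from the BeautifulSoup example)
for li in e.xpath("//div[@class='nl_con clearfix']//li"):
    name = li.xpath(".//div[@class='nlcd_name']/a/text()")            # may be empty for some rows
    price = li.xpath("string(.//div[@class='nhouse_price'])").strip()
    if name:
        print(name[0].strip(), price)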

2. The BeautifulSoup method:

from bs4 import BeautifulSoup

bs = BeautifulSoup(resp.text, 'html.parser')
nl_con = bs.find("div", class_='nl_con clearfix')         # container holding one <li> per listing
li_list = nl_con.find_all("li")
lst = []
for item in li_list:
    names = item.find('div', class_="nlcd_name")
    names = names.find('a').text.strip()                  # project name text
    lst.append(names)


Personally I prefer the BeautifulSoup approach: I started using it earlier, and it can be combined with the re module, which makes it very flexible and convenient.
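
As a quick illustration of that combination, find_all accepts a compiled regular expression for tag names, attribute values, and text, so classes can be matched loosely. A small sketch, reusing the bs object parsed above (the pattern is just for illustration):

import re

# Match any div whose class attribute contains 'nlcd_name'
for div in bs.find_all("div", class_=re.compile(r"nlcd_name")):
    a = div.find("a")
    if a:
        print(a.get_text(strip=True))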


Full examples (first the XPath version, then the same page with BeautifulSoup):

import requests
from lxml import etree
import pandas

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/86.0.4240.198 Safari/537.36"
}
for i in range(1, 2):
    url = f"https://newhouse.fang.com/house/s/b9{i}"       # page i of the new-house listing
    resp = requests.get(url, headers=headers)
    resp.encoding = resp.apparent_encoding                 # fix the decoding before parsing
    e = etree.HTML(resp.text)
    names = [n.strip() for n in e.xpath("//div[@class='nlcd_name']/a/text()")]
    address = e.xpath("//div[@class='address']/a/@title")
    prices = [d.xpath('string(.)').strip() for d in e.xpath("//div[@class='nhouse_price']")]
    fangyuan = [n.strip() for n in e.xpath("//div[@class='fangyuan']/span/text()")]
    data = []
    for n, a, p, f in zip(names, address, prices, fangyuan):
        data.append([n, a, p, f])

    for row in data:
        print(row)
    df = pandas.DataFrame(data, columns=['小区名', '地址', '单价', '是否在售'])   # name, address, price, availability
    print(df)
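
One detail worth pointing out: the price cell apparently nests its number and unit in separate child tags, which is why the example calls string(.) on each nhouse_price div to concatenate all descendant text instead of taking a plain text() step. A tiny sketch on a made-up fragment (the HTML below is invented purely to show the behaviour):

# string(.) concatenates the text of a node and all of its descendants
frag = etree.HTML("<div class='nhouse_price'><span>12000</span>元/㎡</div>")
print(frag.xpath("string(//div[@class='nhouse_price'])").strip())    # -> 12000元/㎡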


import requests
from bs4 import BeautifulSoup
import re
import pandas

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/86.0.4240.198 Safari/537.36"
}
for i in range(1, 2):
    url = f"https://newhouse.fang.com/house/s/b9{i}"
    resp = requests.get(url, headers=headers)
    resp.encoding = resp.apparent_encoding
    bs = BeautifulSoup(resp.text, 'html.parser')
    nl_con = bs.find("div", class_='nl_con clearfix')      # container for all listings on the page
    li_list = nl_con.find_all("li")                        # one <li> per listing
    lst = []
    for item in li_list:
        names = item.find('div', class_="nlcd_name")
        names = names.find('a').text.strip()               # project name
        address = item.find('div', class_="address")
        address = address.find('a')['title']               # full address from the title attribute
        prices = item.find('div', class_="nhouse_price").text.strip()
        # strip all whitespace (including newlines) with re -- this is where bs4 and re work together
        house_type = re.sub(r'\s+', '', item.find('div', class_='house_type clearfix').text)
        fangyuan = re.sub(r'\s+', '', item.find('div', class_='fangyuan').text)
        # print(names, address, prices, house_type, fangyuan)
        # break
        lst.append([names, address, prices, house_type, fangyuan])

    # for row in lst:
    #     print(row)

    df = pandas.DataFrame(lst, columns=['小区名', '地址', '单价', '户型信息', '是否在售'])   # name, address, price, unit types, availability
    print(df)
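
If you want to keep the result instead of just printing it, pandas can write the DataFrame straight to a file. A minimal sketch (the file name and encoding are my own choices):

# utf-8-sig keeps the Chinese column headers readable when the CSV is opened in Excel
df.to_csv("newhouse.csv", index=False, encoding="utf-8-sig")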
