Python——python读取html实战，作业7（python programming）

【转】Python——读取html的table内容

查看源码，观察html结构

【转】Python——读取html的table内容

# -*- coding: utf-8 -*-

from lxml.html import parse

from urllib.request import urlopen

import pandas as pd

# 可能爬的这个网页比较特殊，需要写下面两句话

import ssl

ssl._create_default_https_context = ssl._create_unverified_context

# 根据链接获得整个html放到doc中

parsed = parse(urlopen('https://info.zufe.edu.cn/xygk/szdw.htm'))

doc = parsed.getroot()

#读取html中的table

# 用列表来存老师名字

all_teachers=[]

# 用字典保存主页链接

link_dic={}

# 用字典保存职称

zhicheng={}

# 找到html中有<table></table>的所有table，以列表的形式返回给tables

tables = doc.findall('.//table')

# 我们要的是第一个table

content=tables[0].text_content()

tds = tables[0].findall('.//td')

# 一条条遍历所有td里的内容

for td in tds:

    # 判断当前属于哪个职称，再给zc赋值

    zhi=td.findall('.//strong')

    if len(zhi)!=0:

        print(zhi[0].text_content())

        zc=zhi[0].text_content()

    print(td.text_content())

    link=td.findall('.//a')

    if len(link)!=0:

        print("link",link[0].get('href'))

        # td.text_content()存的就是姓名

        # 保存链接

        link_dic[td.text_content()]=link[0].get('href')

        # 保存老师姓名

        all_teachers.append(str(td.text_content()))

        # 保存职称

        zhicheng[td.text_content()]=zc

print("张 帅的主页链接是：",link_dic["张 帅"])

print("张 帅的职称链接是：",zhicheng["张 帅"])

# 后面的各系不属于老师去掉

all_teachers=all_teachers[:68]

#姓名，职称，主页链接整理成dataframe

dataframe={"姓名":[],

           "职称":[],

           "主页链接":[]}

for teacher in all_teachers:

    dataframe["姓名"].append(teacher)

    dataframe["职称"].append(zhicheng[teacher])

    dataframe["主页链接"].append(link_dic[teacher])

dataframe=pd.DataFrame(dataframe)

print(dataframe)

【转】Python——读取html的table内容

秒客网

【转】Python——读取html的table内容

Python——python读取html实战，作业7（python programming）

相关文章