利用Python爬取8684公交路线查询网站中全国公交站点信息

时间:2024-03-04 18:35:55

利用 Python 语言结合 requests、BeautifulSoup 等类库,先通过 https://api.8684.cn/v3/api.php?do=citys&act=province 接口获取全国城市列表,再从各城市的 8684 公交页面爬取公交线路名称,最后调用高德地图公交线路查询接口获取每条线路的导航信息以及公交站点信息。

import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup


# Crawl the bus line names of the selected cities from 8684.cn, then look each
# line up through the Amap (Gaode) bus-line search API to obtain the route
# details and the name / coordinates of every stop on it.

# Separator written between the fields of every output record.
_FIELD_SEP = '\u0001'

# Seconds to wait for any single HTTP request before giving up on it.
_TIMEOUT = 10

# Browser-like request header sent to 8684.cn.
_HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
}

_CITY_LIST_URL = 'https://api.8684.cn/v3/api.php?do=citys&act=province'
_AMAP_LINE_URL = 'https://restapi.amap.com/v3/bus/linename'

# Bus-line fields written to the navigation file, in output order.
_BUSLINE_FIELDS = (
    'id', 'type', 'name', 'polyline', 'citycode', 'start_stop',
    'end_stop', 'start_time', 'end_time', 'status', 'company',
)


def _iter_line_names(city_py, page):
    """Yield ``(link text, cleaned line name)`` for one 8684 ``line<page>`` page."""
    # Each line1, line2, line3 ... page lists the bus lines of one category.
    url = 'https://{}.8684.cn/line{}'.format(city_py, page)
    try:
        res = requests.get(url=url, headers=_HEADERS, timeout=_TIMEOUT)
    except requests.RequestException as exc:
        print('skipping {}: {}'.format(url, exc))
        return
    soup = BeautifulSoup(res.text, "lxml")
    div = soup.find('div', class_='list clearfix')
    if div is None:
        # Pages without the line list (e.g. a category that does not exist).
        return
    for item in div.find_all('a'):
        lines = item.text
        # Keep only the part before the first space or opening parenthesis.
        line = re.split(r' |\(', lines)[0]
        if line:
            yield lines, line


def _fetch_buslines(city_name, line, amap_key, retries=3, delay=2):
    """Return the ``buslines`` list Amap reports for *line*, or ``[]`` on failure.

    The request is attempted at most *retries* times; every attempt is
    followed by a *delay*-second pause so the API is not hit too quickly.
    """
    params = {
        's': 'rsv3',
        'extensions': 'all',
        'key': amap_key,
        'output': 'json',
        'city': city_name,
        'offset': 2,  # presumably the number of results per page - confirm in the Amap docs
        'keywords': line,
        'platform': 'JS',
    }
    for _ in range(retries):
        try:
            # ``params`` lets requests URL-encode the city and line names.
            response = requests.get(_AMAP_LINE_URL, params=params, timeout=_TIMEOUT)
        except requests.RequestException as exc:
            print('request for {} failed: {}'.format(line, exc))
            response = None
        time.sleep(delay)
        if response is None or response.status_code != 200:
            continue
        print(response.url)
        try:
            rt = json.loads(response.text)
        except ValueError:
            print('invalid JSON returned for {}'.format(line))
            continue
        buslines = rt.get('buslines') if isinstance(rt, dict) else None
        return buslines or []
    return []


def _busline_record(busline):
    """Join the navigation fields of one Amap bus line into an output record."""
    return _FIELD_SEP.join(str(busline[field]) for field in _BUSLINE_FIELDS)


def _stop_record(line, stop):
    """Join the fields of one bus stop (tagged with its line name) into a record."""
    fields = (stop['id'], line, stop['name'], stop['location'], stop['sequence'])
    return _FIELD_SEP.join(str(field) for field in fields)


def _append_lines(path, records):
    """Append every record in *records* to *path*, one per line, as UTF-8."""
    with open(path, 'a', encoding="utf-8") as file:
        file.writelines(record + "\n" for record in records)


def get_city(target_cities=('广州',), amap_key='“替换高德创作平台个人Key”',
             res_dir='E:\\全国公交站点信息数据\\', max_line_pages=199):
    """Crawl bus line and bus stop data for the selected cities.

    The city list is read from the 8684 API. For every city whose name is in
    *target_cities*, the ``line1`` .. ``line<max_line_pages>`` pages of its
    8684 site are scanned for line names, and each name is searched through
    the Amap bus-line API. Results are appended to two files per city inside
    *res_dir*: ``<city>公交导航信息数据.txt`` (one record per bus line) and
    ``<city>公交站点信息数据.txt`` (one record per stop). Fields are separated
    by ``\\u0001``.

    :param target_cities: city names (as returned in the ``c`` field of the
        8684 city list) to crawl; a single string is treated as one name.
    :param amap_key: Amap Web-service key; the default is a placeholder that
        must be replaced with a real key.
    :param res_dir: directory the output files are written to; it is created
        if it does not exist.
    :param max_line_pages: highest ``line<N>`` page number to try per city.
    :rtype: None
    """
    if isinstance(target_cities, str):
        # A bare string would turn ``in`` into a substring test.
        target_cities = (target_cities,)

    city_data = requests.get(_CITY_LIST_URL, timeout=_TIMEOUT).text
    print(city_data)
    city_res = json.loads(city_data)

    for province in city_res['stations']:
        for city in province['childs']:
            city_py = city['e']
            city_name = city['c']
            if city_name not in target_cities:
                continue

            if res_dir:
                os.makedirs(res_dir, exist_ok=True)
            nav_path = os.path.join(res_dir, str(city_name) + '公交导航信息数据.txt')
            stop_path = os.path.join(res_dir, str(city_name) + '公交站点信息数据.txt')

            for page in range(1, max_line_pages + 1):
                for lines, line in _iter_line_names(city_py, page):
                    print(lines, "++++++++++++++++++++++++++++++++++", line)
                    for busline in _fetch_buslines(city_name, line, amap_key):
                        info = _busline_record(busline)
                        print(info)
                        _append_lines(nav_path, [info])

                        stop_records = [
                            _stop_record(line, stop)
                            for stop in busline['busstops']
                        ]
                        for info_ in stop_records:
                            print(info_)
                        _append_lines(stop_path, stop_records)


if __name__ == '__main__':
    # get_city() has no return statement; it is run for its side effects
    # (console output and the files it appends to), so nothing is bound here.
    get_city()



再结合文件写入等操作,将采集到的站点信息以及导航信息保存至对应城市的文件中。

数据样例展示,分隔符为 \u0001(不可见的 SOH 控制字符):