Python爬取各大外包网站需求

时间:2022-11-14 07:55:26


前言

为了更好的掌握数据处理的能力,因而开启Python网络爬虫系列小项目文章。

  • 小项目小需求驱动
  • 总结各种方式
    • 页面源代码返回数据(Xpath、Bs4、PyQuery、正则)
    • 接口返回数据

一、需求

二、分析

一品威客
1、查看网页源代码
2、查找数据
3、获取详情页(赏金、任务要求、需求、状态)

Python爬取各大外包网站需求
Python爬取各大外包网站需求

软件项目交易网
1、查看网页源码
2、全局搜索数据

Python爬取各大外包网站需求
Python爬取各大外包网站需求

获取YesPMP平台需求任务
1、查看网页源代码
2、全局搜索数据

Python爬取各大外包网站需求
Python爬取各大外包网站需求

码市
1、F12抓包即可获取数据
2、构造请求即可获取数据

Python爬取各大外包网站需求

三、处理

一品威客
1、任务页任务
2、详情页(处理直接雇佣)
3、获取赏金、任务要求、时间

# -*- encoding:utf-8 -*-
__author__ = "Nick"
__created_date__ = "2022/11/12"


import requests
from bs4 import BeautifulSoup
import re


# Request headers passed to every page fetch in this script: a desktop Chrome
# user-agent string plus a Content-Type header.
HEADERS = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
           "Content-Type": "text/html; charset=utf-8"}


def get_index_source(url, timeout=10):
    """Fetch a page with a GET request and return its body decoded as UTF-8.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    res = requests.get(url, headers=HEADERS, timeout=timeout)
    # Force UTF-8 instead of relying on the charset guessed from the response.
    res.encoding = "utf-8"
    return res.text

def method_bs4(html):
    """Build a BeautifulSoup tree from *html* using the ``html.parser`` backend."""
    return BeautifulSoup(html, "html.parser")



def method_zz(code):
    """Report whether a task detail page describes a direct-hire task.

    Args:
        code: HTML source of the detail page.

    Returns:
        True if the content of any ``<meta name="description" ... />`` tag
        contains the marker "直接雇佣任务", otherwise False.
    """
    deal = re.compile(r'<meta name="description" content="(?P<is_direct>.*?)" />', re.S)
    # any() always produces a real bool, so the "not found" case is an explicit
    # False rather than an implicit None from falling off the end.
    return any(
        "直接雇佣任务" in match.group("is_direct")
        for match in deal.finditer(code)
    )


def get_task_url(html):
    """Map each task title on the listing page to its detail-page URL.

    Args:
        html: HTML source of the task listing page.

    Returns:
        A dict of task title -> ``href`` of the first ``<a>`` inside each
        element matching the ``.title.marginLeft`` selector. Entries without
        a usable link are skipped.
    """
    marker = "【数据采集】"
    page = method_bs4(html)
    task_urls = {}
    # Task entries are the elements carrying both the "title" and
    # "marginLeft" classes.
    for entry in page.select(".title.marginLeft"):
        link = entry.find("a")
        if link is None or not link.get("href"):
            # Nothing to follow for this entry; skip it instead of raising.
            continue
        pieces = entry.text.split(marker)
        # The title is the text right after the category marker. When the
        # marker is missing, fall back to the whole (trimmed) text rather
        # than failing with IndexError.
        task = pieces[1] if len(pieces) > 1 else entry.text.strip()
        task_urls[task] = link["href"]
    return task_urls


def get_task_content(url_dict):
    """Download every task detail page and append one record per task to a file.

    Records are appended to "一品威客任务.txt". A direct-hire task produces a
    single ``直接雇佣-<name><money>`` line; every other task produces
    ``<name>---<content>,<money>,<start>,<end>``. Scraped values are also
    printed as they are found.

    Args:
        url_dict: Mapping of task name -> detail-page URL.
    """
    with open("一品威客任务.txt", mode="a+", encoding="utf-8") as f:
        for name, url in url_dict.items():
            code_source = get_index_source(url)
            page = method_bs4(code_source)

            # Reset the fields for every task: a selector that matches nothing
            # must yield an empty field, not a NameError on the first task or
            # a value left over from the previous task.
            task_money = start_time = end_time = content_data = ""

            # Reward amount (the last matching span wins).
            for money_tag in page.select(".nummoney.f_l span"):
                task_money = money_tag.text.strip("\n").strip(" ")
                print(task_money)

            # Direct-hire tasks get their own record; skip the regular record
            # so the task is not written twice.
            if method_zz(code_source):
                f.write(f"直接雇佣-{name}{task_money}\n")
                continue

            # Start / end timestamps are attributes of the countdown element.
            for countdown in page.select("#TimeCountdown"):
                start_time = countdown["starttime"]
                end_time = countdown["endtime"]
                print(start_time, end_time)

            # Requirement text (the last paragraph wins).
            for paragraph in page.select(".task-info-content p"):
                content_data = paragraph.text
                print(content_data)

            f.write(f"{name}---{content_data},{task_money},{start_time},{end_time}\n")


if __name__ == "__main__":
    # Fetch the listing page, collect the task links, then scrape each task.
    listing_html = get_index_source("https://task.epwk.com/sjcj/")
    get_task_content(get_task_url(listing_html))

软件项目交易网
通过Xpath即可获取对应数据

# -*- encoding:utf-8 -*-
__author__ = "Nick"
__created_date__ = "2022/11/12"


import requests
from lxml import etree


# Request headers passed to every page fetch in this script: a desktop Chrome
# user-agent string plus a Content-Type header.
HEADERS = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
           "Content-Type": "text/html; charset=utf-8"}


def get_index_source(url, timeout=10):
    """Fetch a page with a GET request and return its body decoded as UTF-8.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    res = requests.get(url, headers=HEADERS, timeout=timeout)
    # Force UTF-8 instead of relying on the charset guessed from the response.
    res.encoding = "utf-8"
    return res.text

def method_xpath(html):
    """Parse *html* with ``lxml.etree.HTML`` and return the resulting tree."""
    return etree.HTML(html)


def get_task_info(html):
    """Extract every project on the listing page and write them to a text file.

    One line per project is written to "软件交易网站需求.txt" (the file is
    overwritten) in the form
    ``status,task,budget,popularity,publish date,deadline``.

    Args:
        html: HTML source of the project listing page.
    """

    def _pick(nodes, index):
        """Return ``nodes[index]``, or "" when the query matched too few nodes."""
        try:
            return nodes[index]
        except IndexError:
            return ""

    # Parse before opening the output file so that a parse failure does not
    # truncate an existing file.
    parse = method_xpath(html)
    with open("软件交易网站需求.txt", mode="w", encoding="utf-8") as f:
        for li in parse.xpath('//*[@id="projectLists"]/div/ul/li'):
            # Status is the second text node under the left_2 spans.
            status = _pick(li.xpath('./div[@class="left_2"]/span/text()'), 1).strip()
            # The task title is the last text node of the heading link.
            task_content = _pick(li.xpath('./div[@class="left_8"]/h4/a/text()'), -1).strip()
            # Spans 1-4 hold budget, popularity, publish date and deadline.
            bond, hot, start_time, end_time = (
                _pick(li.xpath(f'./div[@class="left_8"]/span[{position}]/em/text()'), 0)
                for position in range(1, 5)
            )
            f.write(f"{status},{task_content},{bond},{hot},{start_time},{end_time}\n")




if __name__ == "__main__":
    # Download the project listing page and dump every project to the file.
    get_task_info(get_index_source("https://www.sxsoft.com/page/project"))

获取YesPMP平台需求任务
通过PyQuery即可获取数据

# -*- encoding:utf-8 -*-
__author__ = "Nick"
__created_date__ = "2022/11/12"


import requests
from pyquery import PyQuery as pq


# Request headers passed to every page fetch in this script: a desktop Chrome
# user-agent string plus a Content-Type header.
HEADERS = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
           "Content-Type": "text/html; charset=utf-8"}


def get_index_source(url, timeout=10):
    """Fetch a page with a GET request and return its body decoded as UTF-8.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    res = requests.get(url, headers=HEADERS, timeout=timeout)
    # Force UTF-8 instead of relying on the charset guessed from the response.
    res.encoding = "utf-8"
    return res.text

def method_pq(html):
    """Wrap *html* in a PyQuery document and return it."""
    return pq(html)


def get_task_info(html):
    """Append one ``name,price,date,bidders`` line per project to the output file.

    Projects are the elements matching ``.promain``; each field is the text of
    the child matching ``.name``, ``.price``, ``.date`` and ``.num``.

    Args:
        html: HTML source of one listing page.
    """
    with open("yespmp网站需求.txt", mode="a", encoding="utf-8") as out:
        document = method_pq(html)
        for project in document.find(".promain").items():
            # Task name, reward, project period and number of bidders, in
            # the order they are written out.
            task_name, price, period, bidders = (
                project.find(selector).text()
                for selector in (".name", ".price", ".date", ".num")
            )
            out.write(f"{task_name},{price},{period},{bidders}\n")


if __name__ == "__main__":
    # Walk listing pages 2 through 9 and append each page's projects.
    page_urls = (
        f"https://www.yespmp.com/project/index_i{number}.html"
        for number in range(2, 10)
    )
    for page_url in page_urls:
        get_task_info(get_index_source(page_url))

码市
基本request请求操作(请求头、参数)

# -*- encoding:utf-8 -*-
__author__ = "Nick"
__created_date__ = "2022/11/12"


import requests
import json

# Request headers sent with the API call: cookie, referer, accept and
# user-agent values.
# NOTE(review): the cookie embeds session identifiers (e.g. JSESSIONID) that
# look captured from one browser session — presumably they expire; confirm
# whether the API needs them at all.
headers = {
        'cookie': 'mid=6c15e915-d258-41fc-93d9-939a767006da; JSESSIONID=1hfpjvpxsef73sbjoak5g5ehi; _gid=GA1.2.846977299.1668222244; _hjSessionUser_2257705=eyJpZCI6ImI3YzVkMTc5LWM3ZDktNTVmNS04NGZkLTY0YzUxNGY3Mzk5YyIsImNyZWF0ZWQiOjE2NjgyMjIyNDM0NzgsImV4aXN0aW5nIjp0cnVlfQ==; _ga_991F75Z0FG=GS1.1.1668245580.3.1.1668245580.0.0.0; _ga=GA1.2.157466615.1668222243; _gat=1',
        'referer': 'https://codemart.com/projects?labelId=&page=1',
        'accept': 'application/json',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }


def get_data(page=1, timeout=10):
    """Fetch one page of the project list from the JSON API and print it.

    Args:
        page: Value of the ``page`` query parameter (defaults to 1, the page
            that was previously hard-coded).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The decoded JSON payload, so callers can process it instead of only
        seeing it printed.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    url = f"https://codemart.com/api/project?labelId=&page={page}"
    # A GET request carries no body, so no ``data`` payload is sent.
    response = requests.get(url, headers=headers, timeout=timeout)
    data = json.loads(response.text)
    print(data)
    return data


# Script entry point: request the project list and print the decoded JSON.
if __name__ == '__main__':
    get_data()

四、总结

  • Xpath
    • 适用于要获取的信息在某个标签下,且各标签层次明显,通过路径找到位置,for循环遍历即可
  • Bs4
    • 适用于要获取的信息比较分散,且通过选择器可以定位(class唯一、id唯一)
  • PyQuery
    • 适用于要获取的信息比较分散,且通过选择器可以定位(class唯一、id唯一)
  • 正则
    • 通过(.*?)就可以处理元素失效或者定位少量信息
    • 不适用于网页代码中夹杂大量其它符号的情况,容易导致定位失效
  • 接口返回数据
    • 对于接口没有进行加密,通过requests构造请求即可获取数据
    • 关注点在请求头中的参数

欢迎加入免费的知识星球内!
我正在「Print(“Hello Python”)」和朋友们讨论有趣的话题,你一起来吧?
https://t.zsxq.com/076uG3kOn