Python Crawler | Using an IP Proxy Pool

Posted: 2024-01-30 16:36:42

I. Introduction

- Why crawlers need proxies

  Many websites deploy anti-scraping measures. A common one is counting how many requests a given IP makes within a time window; if the access frequency looks too high to be a normal visitor, that IP may be banned. By setting proxy IPs and switching to a different one every so often, the crawler can keep fetching data even after an IP gets blocked.

 

- Types of proxies

  Forward proxy: acts on behalf of the client to fetch data; it shields the client from being identified (and held responsible).

  Reverse proxy: acts on behalf of the server to serve data; it shields the server and is also commonly used for load balancing.

 

- Websites offering free proxy IPs

  http://www.goubanjia.com/

  西刺代理 (xicidaili)

  快代理 (kuaidaili)

 

Anonymity levels (a quick way to check what the target actually sees is sketched after this list)

  - Transparent: the target knows you are using a proxy IP and can also see your real IP

  - Anonymous: the target knows you are using a proxy IP but cannot see your real IP

  - Elite (high anonymity): the target does not know you are using a proxy and cannot see your real IP
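
One way to check a proxy's anonymity in practice is to send a request through it to an echo service such as http://httpbin.org/get (used here purely as an example endpoint) and inspect what the target received: the reported origin IP and headers like X-Forwarded-For or Via give away transparent and ordinary anonymous proxies, while an elite proxy leaves neither. A minimal sketch, reusing one of the placeholder proxy addresses from the example further below:

import requests

# Placeholder proxy address; substitute an ip:port from your own pool
proxies = {'http': 'http://112.115.57.20:3128'}

# httpbin echoes back the origin IP and the request headers it received
res = requests.get('http://httpbin.org/get', proxies=proxies, timeout=8)
data = res.json()

print(data['origin'])                              # the IP the target site sees
print(data['headers'].get('X-Forwarded-For'))      # set by transparent/anonymous proxies
print(data['headers'].get('Via'))                  # often added by non-elite proxies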

 

Type:

  - http: can only be used to request URLs that start with http

  - https: can only be used to request URLs that start with https

  (In requests, the proxy is chosen by matching the key of the proxies dict against the scheme of the target URL, so the proxy type must be consistent with the protocol of the URL being requested.)

 

Example

import random
import requests


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
# The proxy entries below are plain http proxies, so request an http URL
url = 'http://www.baidu.com/s?wd=ip'

# Candidate proxy IPs; the proxy type (the dict key) must match the scheme of the requested URL
proxy_list = [
    {'http': '112.115.57.20:3128'},
    {'http': '121.41.171.223:3128'}
]

# Pick one proxy IP at random
proxy = random.choice(proxy_list)

page_text = requests.get(url=url, headers=headers, proxies=proxy).text

with open('ip.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)

print('over!')

 

II. IP Pool

1. Free IP pool

  Crawl IPs from 西刺代理 (xicidaili), test each one iteratively to see whether it works, and build your own proxy IP pool that can be refreshed at any time and used to scrape website data.

 

 

import requests
from lxml import etree
import time
import random
from fake_useragent import UserAgent


class GetProxyIP(object):
    def __init__(self):
        # {} is filled in with the page number in main()
        self.url = 'https://www.xicidaili.com/nn/{}'
        # Proxy used to fetch the xicidaili pages themselves
        self.proxies = {
            'http': 'http://163.204.247.219:9999',
            'https': 'http://163.204.247.219:9999'}

    # Generate a random User-Agent
    def get_random_ua(self):
        ua = UserAgent()        # create the User-Agent object
        useragent = ua.random
        return useragent

    # Fetch one page of proxy IPs from xicidaili
    def get_ip_file(self, url):
        headers = {'User-Agent': self.get_random_ua()}
        html = requests.get(url=url, proxies=self.proxies, headers=headers, timeout=5).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        tr_list = parse_html.xpath('//tr')              # base xpath: one node per proxy IP row

        for tr in tr_list[1:]:
            ip = tr.xpath('./td[2]/text()')[0]
            port = tr.xpath('./td[3]/text()')[0]
            self.test_proxy_ip(ip, port)                # test whether ip:port is usable

    # Test whether a scraped proxy IP works
    def test_proxy_ip(self, ip, port):
        proxies = {
            'http': 'http://{}:{}'.format(ip, port),
            'https': 'https://{}:{}'.format(ip, port)}
        test_url = 'http://www.baidu.com/'
        try:
            res = requests.get(url=test_url, proxies=proxies, timeout=8)
            if res.status_code == 200:
                print(ip, ":", port, 'Success')
                with open('proxies.txt', 'a') as f:
                    f.write(ip + ':' + port + '\n')
        except Exception as e:
            print(ip, port, 'Failed')

    def main(self):
        for i in range(1, 1001):                        # pages 1-1000
            url = self.url.format(i)
            self.get_ip_file(url)
            time.sleep(random.randint(5, 10))           # pause between pages to avoid getting banned


if __name__ == '__main__':
    spider = GetProxyIP()
    spider.main()

Take an IP from the pool, i.e. have the crawler pick a random proxy IP from the file:

import random
import requests


class BaiduSpider(object):
    def __init__(self):
        self.url = 'http://www.baidu.com/'
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.flag = 1

    def get_proxies(self):
        with open('proxies.txt', 'r') as f:
            result = f.readlines()                  # read all lines into a list
        proxy_ip = random.choice(result)[:-1]       # pick one line at random and strip the trailing newline
        L = proxy_ip.split(':')
        proxy_ip = {
            'http': 'http://{}:{}'.format(L[0], L[1]),
            'https': 'https://{}:{}'.format(L[0], L[1])
        }
        return proxy_ip

    def get_html(self):
        proxies = self.get_proxies()
        if self.flag <= 3:                          # make at most 3 attempts, each with a freshly chosen proxy
            try:
                html = requests.get(url=self.url, proxies=proxies, headers=self.headers, timeout=5).text
                print(html)
            except Exception as e:
                print('Retry')
                self.flag += 1
                self.get_html()


if __name__ == '__main__':
    spider = BaiduSpider()
    spider.get_html()

 

2. Paid proxy API

 Write an interface that fetches proxies from a paid open API.

import requests
from fake_useragent import UserAgent

ua = UserAgent()                        # create the User-Agent object
useragent = ua.random
headers = {'User-Agent': useragent}


# Test whether a proxy returned by the API works
def ip_test(ip):
    url = 'http://www.baidu.com/'
    ip_port = ip.split(':')
    proxies = {
        'http': 'http://{}:{}'.format(ip_port[0], ip_port[1]),
        'https': 'https://{}:{}'.format(ip_port[0], ip_port[1]),
    }
    res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
    if res.status_code == 200:
        return True
    else:
        return False


# Fetch the list of proxy IPs
def get_ip_list():
    # Kuaidaili: https://www.kuaidaili.com/doc/product/dps/
    api_url = 'http://dev.kdlapi.com/api/getproxy/?orderid=946562662041898&num=100&protocol=1&method=2&an_an=1&an_ha=1&sep=2'
    html = requests.get(api_url).content.decode('utf-8', 'ignore')
    ip_port_list = html.split('\n')

    for ip in ip_port_list:
        ip = ip.strip()
        if not ip:                      # skip blank lines returned by the API
            continue
        with open('proxy_ip.txt', 'a') as f:
            if ip_test(ip):
                f.write(ip + '\n')


if __name__ == '__main__':
    get_ip_list()

 

3. Private (authenticated) proxies

1. Syntax

  The username and password are provided together with the API URL when you purchase the service; they are not your own account name and password.

proxies = {
    'protocol': 'protocol://username:password@IP:port'
}
proxies = {
    'http': 'http://username:password@IP:port',
    'https': 'https://username:password@IP:port'
}
proxies = {
    'http': 'http://309435365:szayclhp@106.75.71.140:16816',
    'https': 'https://309435365:szayclhp@106.75.71.140:16816',
}

 

# Interface for fetching proxies from the private-proxy API
import requests
from fake_useragent import UserAgent

ua = UserAgent()  # create the User-Agent object
useragent = ua.random
headers = {'User-Agent': useragent}


# Test whether an authenticated proxy works
def ip_test(ip):
    url = 'https://blog.csdn.net/qq_34218078/article/details/90901602/'
    ip_port = ip.split(':')
    proxies = {
        'http': 'http://1786088386:b95djiha@{}:{}'.format(ip_port[0], ip_port[1]),
        'https': 'http://1786088386:b95djiha@{}:{}'.format(ip_port[0], ip_port[1]),
    }

    res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
    if res.status_code == 200:
        print("OK")
        return True
    else:
        print(res.status_code)
        print("Error")
        return False


# Fetch the list of proxy IPs
def get_ip_list():
    # Kuaidaili: https://www.kuaidaili.com/doc/product/dps/
    api_url = 'http://dps.kdlapi.com/api/getdps/?orderid=986603271748760&num=1000&signature=z4a5b2rpt062iejd6h7wvox16si0f7ct&pt=1&sep=2'
    html = requests.get(api_url).content.decode('utf-8', 'ignore')
    ip_port_list = html.split('\n')

    for ip in ip_port_list:
        ip = ip.strip()
        if not ip:                      # skip blank lines returned by the API
            continue
        with open('proxy_ip.txt', 'a') as f:
            if ip_test(ip):
                f.write(ip + '\n')


if __name__ == '__main__':
    get_ip_list()

 

Approach (a minimal skeleton following these steps is sketched below):

  • Write a class;
  • get_ip(): request the API with requests and obtain the ip and port;
  • test_ip(): request some site and judge whether the ip works, either from the status code or by using `in` to check whether the response contains some expected content; return True or False;
  • save_ip(): save the ip once the test succeeds.
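
A minimal sketch of such a class following the steps above (the api_url below is a placeholder for your provider's real extraction URL, and baidu is used only as an example test site):

import requests


class ProxyPool(object):
    """Sketch: fetch proxies from an API, test each one, save the working ones."""

    def __init__(self):
        # Placeholder extraction URL; replace YOUR_ORDER_ID with your provider's real parameters
        self.api_url = 'http://dps.kdlapi.com/api/getdps/?orderid=YOUR_ORDER_ID&num=100&sep=2'
        self.test_url = 'http://www.baidu.com/'
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    # Request the API and return a list of "ip:port" strings
    def get_ip(self):
        html = requests.get(self.api_url, headers=self.headers, timeout=5).text
        return [line.strip() for line in html.split('\n') if line.strip()]

    # Request the test site through the proxy and judge by the status code
    def test_ip(self, ip_port):
        proxies = {
            'http': 'http://{}'.format(ip_port),
            'https': 'http://{}'.format(ip_port),
        }
        try:
            res = requests.get(self.test_url, headers=self.headers, proxies=proxies, timeout=5)
            return res.status_code == 200
        except Exception:
            return False

    # Append a working proxy to the local pool file
    def save_ip(self, ip_port):
        with open('proxy_ip.txt', 'a') as f:
            f.write(ip_port + '\n')

    def run(self):
        for ip_port in self.get_ip():
            if self.test_ip(ip_port):
                self.save_ip(ip_port)


if __name__ == '__main__':
    ProxyPool().run()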