[Python Network Programming] gevent httpclient and Page Encoding

Date: 2023-03-08 20:25:19

A while back I came across the geventhttpclient project, https://github.com/gwik/geventhttpclient. The docs say it is very fast because the response is parsed in C, so I had been wanting to use it in a project.

I have spent the last couple of days wrestling with it, and to put it bluntly, it is fairly hard to use and the wrapper it provides is weak. Its biggest shortcomings are:

1. It does not follow redirects; you have to implement redirect handling yourself, which is tedious.

2. A newly created httpclient object can only send requests to the host it was created for (see the short sketch after this list).
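To make the second point concrete, here is roughly what using the stock client looks like. This is a minimal sketch, assuming geventhttpclient's HTTPClient.from_url / get API as used in the code further below; the URLs are just examples.

from geventhttpclient.url import URL
from geventhttpclient.client import HTTPClient

url = URL('http://www.baidu.com/')
client = HTTPClient.from_url(url)    # the client is now bound to www.baidu.com:80
response = client.get('/')           # same host: fine
print(response.status_code)
response.read()
client.close()

# Pointing the same client at a different host does not just work:
# client.get('http://www.163.com/')  # raises ValueError("Invalid host in URL")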

This is quite painful, so I spent some time wrapping it myself. The wrapper solves the two problems above and also adds automatic page decoding. The code is as follows:

#!/usr/bin/env python
# -*- encoding: UTF-8 -*-
import re

from geventhttpclient.url import URL
from geventhttpclient.client import HTTPClient, HTTPClientPool
from urlparse import urljoin
#from core.common import urljoin

HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0'
}
DEFAULF_METHOD = "GET"
MAX_REDIRECT_TIME = 10
DEFAULT_PAGE_ENCODING = "utf8"


class DifferDomainException(Exception):
    """
    Raised when a request targets a URL on a different domain;
    geventhttpclient itself raises ValueError("Invalid host in URL")
    in that case (see geventhttpclient.client).
    """
    def __init__(self, uri):
        self.args = uri
        self.uri = uri


class MaxRedirectException(Exception):
    def __init__(self, response):
        self.args = response
        self.response = response


class HTTP(HTTPClient):
    def request(self, request_uri, method=DEFAULF_METHOD, body=b"", headers={},
                follow_redirect=True, redirects=MAX_REDIRECT_TIME):
        if body and method == DEFAULF_METHOD:
            method = "POST"
        h = [k.title() for k in headers.iterkeys()]
        headers.update(dict([(k, v) for k, v in HEADERS.iteritems() if k not in h]))
        response = super(HTTP, self).request(method, request_uri, body, headers)
        if follow_redirect and response.status_code in (301, 302, 303, 307) and response.method in ("GET", "POST"):
            if redirects:
                location = response.get('location') or response.get('content-location') or response.get('uri')
                if location:
                    location = urljoin(request_uri, location)
                    if not location.startswith(self._base_url_string):
                        raise DifferDomainException(location)
                    return self.request(location, method, body, headers, follow_redirect, redirects - 1)
            else:
                raise MaxRedirectException(response)
        return response


class HTTPPool(HTTPClientPool):
    def get_client(self, url):
        if not isinstance(url, URL):
            url = URL(url)
        client_key = url.host, url.port
        try:
            return self.clients[client_key]
        except KeyError:
            client = HTTP.from_url(url, **self.client_args)
            self.clients[client_key] = client
            return client


_POLL = HTTPPool(network_timeout=100, connection_timeout=100)

META_CHARSET_REGEX = re.compile(r'(?si)<head>.*<meta http-equiv="?content-type"?[^>]+charset=(?P<result>[^">]+).*</head>')


def decodePage(content, content_type):
    httpCharset, metaCharset = None, None
    if content_type and content_type.find("charset=") != -1:
        httpCharset = content_type.split("charset=")[-1]
    match = META_CHARSET_REGEX.search(content)
    if match:
        metaCharset = match.group('result')
    print httpCharset, metaCharset
    charset = httpCharset or metaCharset or DEFAULT_PAGE_ENCODING
    return content.decode(charset).encode(DEFAULT_PAGE_ENCODING)


def request(request_uri, method=DEFAULF_METHOD, body=b"", headers={}, follow_redirect=True, auto_read=True):
    client = _POLL.get_client(request_uri)
    response = None
    try:
        response = client.request(request_uri, method, body, headers, follow_redirect)
    except DifferDomainException, e:
        print "DifferDomainException:" + e.uri
        response = request(e.uri, method, body, headers, follow_redirect)
    except MaxRedirectException, e:
        print "max redirect"
        response = e.response  # return the previous response, i.e. the redirect response itself
    except Exception, e:
        print str(e)
    if auto_read and response:
        with response:
            response.content = decodePage(response.read(), response.get('content-type'))
    return response


def test():
    # print request("http://127.0.0.1/re.php", follow_redirect=False)
    # print request("http://127.0.0.1/re.php", follow_redirect=True).content
    r = request("http://www.baidu.com/", follow_redirect=False)
    # baidu: header utf8, meta utf8
    print r.content[:10]
    r = request("http://www.163.com/", follow_redirect=False)
    # 163: header gbk, meta gb2312
    print r.content[:10]

test()
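Saving the code above as a module, calling it from other code looks roughly like this. A minimal sketch; the module name httpclient_wrap is my own placeholder, not part of the original code.

from httpclient_wrap import request   # hypothetical name for the module above

r = request("http://www.163.com/")
if r is not None:
    print(r.status_code)
    print(r.content[:200])   # decodePage has already re-encoded the body to utf8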

While testing the page-encoding handling I ran into some problems; take a look at the following.


Since the response headers arrive before the body, the usual assumption is that the content encoding should follow the headers first, and only fall back to the page's own meta charset if the headers don't specify one.

Look at NetEase's (163.com) encoding: the header says gbk and the page meta says gb2312, yet decoding with gb2312 actually fails??? I'm quite puzzled. Can anyone explain why?

Decoding with the header's gbk works fine, which confirms that the header encoding takes priority. In theory the meta charset tells the browser to render the page as gb2312, which is clearly wrong here, so how does the browser cope?
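One thing that can at least be demonstrated: gb2312 is a subset of gbk, so a page served as gbk may contain characters that the gb2312 codec refuses to decode. A minimal sketch of that behaviour; the sample character U+9555 is my own choice and not taken from the actual 163.com response.

# -*- coding: utf-8 -*-
# gb2312 is a subset of gbk: U+9555 exists in GBK but not in GB2312, so its
# gbk-encoded bytes cannot be decoded back with the stricter gb2312 codec.
sample = u'\u9555'
data = sample.encode('gbk')    # encodes fine as gbk

print(data.decode('gbk'))      # works, round-trips back to the character
try:
    data.decode('gb2312')      # raises UnicodeDecodeError
except UnicodeDecodeError as e:
    print(e)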

Then I looked at Sina, which left me even more baffled. Can anyone rescue me?
