# A flash of Joy

import re

from datetime import timedelta

from tornado import httpclient, gen, ioloop, queues

# Maps an academy code (appended to the photo URL in main()) to the number of
# per-academy indices that main() generates URLs for.
# NOTE(review): every key in this copy is the empty string, so the literal
# collapses to a single entry ({'': 56}); the real keys were presumably
# stripped before publication -- restore them before running.
peoples = {'': 71, '': 66, '': 54, '': 50, '': 66, '': 61,

           '': 103, '': 31, '': 32, '': 41, '': 33, '': 93, '': 50, '': 108, '': 55,

           '': 55, '': 92, '': 56, '': 29, '': 27,

           '': 25, '': 25, '': 50, '': 66, '': 68, '': 52, '': 50, '': 50, '': 52, '': 50,

           '': 133, '': 166, '': 10, '': 8, '': 99, '': 18,

           '': 50, '': 24, '': 19, '': 25, '': 24, '': 24, '': 67, '': 52, '': 67,

           '': 67, '': 8, '': 31, '': 82, '': 62, '': 8, '': 104, '': 52, '': 52, '': 47,

           '': 56, '': 72, '': 57, '': 36, '': 50, '': 120, '': 50,

           '': 56}

class AsySpider(object):
    """Concurrent URL fetcher built on Tornado coroutines and a work queue.

    Subclasses normally override ``fetch`` (to customise the request) and
    ``handle_html`` (to consume each successfully downloaded body).
    """

    def __init__(self, urls, concurrency=10, results=None, **kwargs):
        """Prepare the crawl state.

        Args:
            urls: iterable of URLs to download. It is copied, so the
                caller's object is neither reordered nor drained.
            concurrency: number of worker coroutines; also the maximum
                number of pending URLs queued after each completed fetch.
            results: optional list stored on ``self.results``; a new empty
                list is used when omitted.
            **kwargs: accepted for backward compatibility and ignored.
        """
        # Stored reversed so that list.pop() hands URLs out in input order.
        self.urls = list(urls)[::-1]
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()
        self._fetched = set()
        # Always define the attribute, including when the caller supplies
        # its own list.
        self.results = [] if results is None else results

    def fetch(self, url, **kwargs):
        """Start an asynchronous request for ``url`` and return its Future.

        ``raise_error=False`` makes HTTP failures come back as response
        objects instead of raised exceptions. Extra keyword arguments are
        forwarded to ``AsyncHTTPClient.fetch``.
        """
        client = httpclient.AsyncHTTPClient()
        return client.fetch(url, raise_error=False, **kwargs)

    def handle_html(self, url, html):
        """Consume the body of a successful response; prints the URL."""
        print(url)

    def handle_response(self, url, response):
        """Dispatch on the outcome of one fetch; override if necessary.

        Args:
            url: the URL that was requested.
            response: whatever ``get_page`` produced -- a response object,
                or the exception instance caught while fetching.
        """
        if isinstance(response, Exception):
            # get_page() has already reported the failure and there is no
            # status code to inspect.
            return
        if response.code == 200:
            self.handle_html(url, response.body)
        elif response.code == 599:
            # Retry: forget the URL so the duplicate check in the worker
            # lets it through again, then put it back on the queue.
            # The queue is unbounded, so put_nowait cannot raise QueueFull.
            self._fetching.discard(url)
            self._q.put_nowait(url)

    @gen.coroutine
    def get_page(self, url):
        """Fetch ``url``; resolve to the response or to the caught exception."""
        try:
            response = yield self.fetch(url)
        except Exception as e:
            print('Exception: %s %s' % (e, url))
            raise gen.Return(e)
        raise gen.Return(response)

    @gen.coroutine
    def _run(self):
        """Seed the queue, start the workers and wait for the queue to drain.

        Raises whatever ``Queue.join`` raises if the queue is not drained
        within the timeout.
        """
        if not self.urls:
            # Nothing to crawl; seeding the queue below would raise IndexError.
            return

        @gen.coroutine
        def fetch_url():
            current_url = yield self._q.get()
            try:
                if current_url in self._fetching:
                    return
                self._fetching.add(current_url)
                response = yield self.get_page(current_url)
                try:
                    self.handle_response(current_url, response)
                except Exception as e:
                    # A failing handler must not kill this worker or skip
                    # the refill below, otherwise the crawl can stall
                    # until the join timeout.
                    print('Exception: %s %s' % (e, current_url))
                self._fetched.add(current_url)
                # Top the queue up with at most `concurrency` new URLs.
                for _ in range(self.concurrency):
                    if self.urls:
                        yield self._q.put(self.urls.pop())
            finally:
                # Every get() must be matched, or join() never returns.
                self._q.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        self._q.put_nowait(self.urls.pop())  # add first url

        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()
        yield self._q.join(timeout=timedelta(seconds=300000))

        # Report URLs that were started but not finished (and vice versa).
        if self._fetching != self._fetched:
            print(self._fetching - self._fetched)
            print(self._fetched - self._fetching)

    def run(self):
        """Run the whole crawl to completion on the current IOLoop."""
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)

class MySpider(AsySpider):
    """Spider that downloads user photos and saves each one as a .jpg file."""

    # Directory the photos are written to; it is concatenated directly with
    # the file name, so it must end with a path separator.
    # Change it to wherever you want the photos stored, e.g. C:/picture/
    save_dir = '/home/innovation/文档/pic/'

    # Appended to the URL before matching so that the greedy group in
    # _owner_id_re has a fixed terminator at the very end of the URL.
    _sentinel = 'qwertyu'

    # Captures everything between "ownerId=" and the sentinel.
    _owner_id_re = re.compile('userPhoto&ownerId=(.*)qwertyu')

    def fetch(self, url, **kwargs):
        """Override the parent fetch to send a fixed User-Agent and cookie.

        Keyword arguments passed by the caller are not forwarded; only the
        headers built here are sent to the parent implementation.
        """
        # NOTE(review): hard-coded session cookie -- presumably tied to one
        # login session; confirm it is still valid before running.
        cookies_str = 'JSESSIONID=0000n4jBi_dKg91XbtHHQHDeeDL:1b4e17j2v; iPlanetDire' \
                      'ctoryPro=AQIC5wM2LY4Sfcxu%' \
                      '2FWPIJWGHttZPiXafd%2B1gowyEoxTmyiY%3D%40AAJTSQACMDE%3D%23'
        headers = {
            'User-Agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
            'cookie': cookies_str
        }
        return super(MySpider, self).fetch(url, headers=headers)

    def handle_html(self, url, html):
        """Write the downloaded bytes to ``save_dir`` as ``<ownerId>.jpg``.

        Args:
            url: the requested URL; the file name is the text following
                ``userPhoto&ownerId=``.
            html: the response body (the raw image bytes).

        URLs without an ``ownerId`` part are reported and skipped instead
        of raising IndexError.
        """
        match = self._owner_id_re.search(url + self._sentinel)
        if match is None:
            print('Skipped (no ownerId in URL): %s' % url)
            return
        # The with-block closes the file; no explicit close() is needed.
        with open(self.save_dir + match.group(1) + '.jpg', 'wb') as photo:
            photo.write(html)

def main():
    """Build one photo URL per (academy, index) pair and crawl them all.

    For each key of ``peoples`` the indices 1..count are generated and
    appended, after the key, to the fixed download URL.
    """
    url_pic = 'http://myportal.sxu.edu.cn/attachmentDownload.portal?notUseCache=true&type=userPhoto&ownerId='

    urls = []
    for academy, count in peoples.items():
        for number in range(1, count + 1):
            # NOTE(review): the two prefixes below are empty strings in this
            # copy; presumably they were zero-padding -- confirm.
            if number < 10:
                suffix = '' + str(number)
            elif number < 100:
                suffix = '' + str(number)
            else:
                suffix = str(number)
            urls.append(url_pic + '' + academy + suffix)

    spider = MySpider(urls)
    spider.run()

# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':

    main()
# 相关文章 (Related articles)