Python爬虫（七）

源码：
 import requests

 import re

 from my_mysql import MysqlConnect

 # 获取详情页链接和电影名称

 def get_urls(page, timeout=10):
     """Fetch one listing page and extract detail-page links with titles.

     Args:
         page: Page number substituted into the listing URL.
         timeout: Seconds to wait on the server before ``requests`` raises
             a timeout error. Without a timeout a stalled connection would
             block the crawl indefinitely.

     Returns:
         A list of ``(href, title)`` tuples, one for each
         ``<a href="..." class="ulink">`` anchor found in the page. The
         list is empty when nothing matches.
     """
     url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'.format(page)
     response = requests.get(url, timeout=timeout)
     # Decode the body as GBK explicitly rather than trusting the encoding
     # that requests guesses from the response headers.
     response.encoding = 'gbk'
     pat = r'<a href="(.*?)" class="ulink">(.*?)</a>'
     return re.findall(pat, response.text)

 # 获取磁力链接

 def get_links(url, timeout=10):
     """Fetch a detail page and extract its magnet and FTP download links.

     Args:
         url: URL of the detail page to download.
         timeout: Seconds to wait on the server before ``requests`` raises
             a timeout error. Without a timeout a stalled connection would
             block the crawl indefinitely.

     Returns:
         A ``(magnet, ftp)`` tuple of strings. Each element is the first
         ``href`` value starting with ``magnet`` / ``ftp`` respectively,
         or ``''`` when the page contains no such link.
     """
     response = requests.get(url, timeout=timeout)
     # Decode the body as GBK explicitly rather than trusting the encoding
     # that requests guesses from the response headers.
     response.encoding = 'gbk'
     html = response.text

     def first_href(pat):
         # re.search returns None when nothing matches; calling .group()
         # on it would raise AttributeError and abort the whole crawl, so
         # a missing link is reported as an empty string instead.
         found = re.search(pat, html)
         return found.group(1) if found else ''

     magnet = first_href(r'href="(magnet.*?)"')
     ftp = first_href(r'href="(ftp.*?)"')
     return magnet, ftp

 if __name__ == '__main__':
     # Imported here so this script section carries its own dependency.
     from urllib.parse import urljoin

     mc = MysqlConnect('127.0.0.1', 'root', '', 'homework')
     # Crawl listing pages 1 through 3.
     for page in range(1, 4):
         for url, name in get_urls(page):
             # urljoin resolves the scraped href against the site root, so
             # an href with a leading slash does not produce "//" in the
             # path and an already-absolute href is kept intact.
             url = urljoin('http://www.dytt8.net/', url)
             magnet, ftp = get_links(url)
             # NOTE(review): the values are scraped text interpolated into
             # the statement via repr(), which is not real SQL escaping.
             # If MysqlConnect.exec accepts query parameters, switch to a
             # parameterized insert - confirm against my_mysql.
             sql = 'insert into dytt(id,name,magnet,ftp) values(null,{},{},{})'.format(repr(name),repr(magnet),repr(ftp))
             print(sql)
             mc.exec(sql)
秒客网

Python爬虫（七）

相关文章