Python 爬虫实例(2)—— 爬取今日头条

时间:2023-03-09 06:00:04
Python 爬虫实例(2)—— 爬取今日头条
# coding: utf-8
"""Toutiao (今日头条) feed scraper: fetches the 'news_game' PC feed and stores
each item into a local SQLite database.

NOTE(review): the original targeted Python 2 (``urllib2``, ``reload(sys)``,
``sys.setdefaultencoding``); those constructs do not exist on Python 3 and
have been removed. Several imports below are unused by the visible code but
are kept from the original in case other parts of the project rely on them.
"""
import base64            # unused here; kept from original
import hashlib
import json
import logging.handlers  # unused here; kept from original
import pickle            # unused here; kept from original
import random            # unused here; kept from original
import re
import sqlite3
import sys               # unused here; kept from original
import threading         # unused here; kept from original
import time              # unused here; kept from original
import uuid              # unused here; kept from original
from datetime import datetime  # unused here; kept from original

import redis             # third-party; unused here, kept from original
import requests
import urllib3           # third-party; unused here, kept from original

# One shared HTTP session so cookies / keep-alive are reused across requests.
session = requests.session()
#把连接加密成 MD5 生成唯一的主键
def md5(str):
import hashlib
m = hashlib.md5()
m.update(str)
return m.hexdigest() def jinri():
list_data = []
for i in range(1,20):
  #请求得到url 链接
url = "http://www.toutiao.com/api/pc/feed/"
data = { "category":"news_game",
"utm_source":"toutiao",
"widen":str(i),
"max_behot_time":"",
"max_behot_time_tmp":"",
"tadrequire":"true",
"as":"479BB4B7254C150",
"cp":"7E0AC8874BB0985",
}
headers = { "Host":"www.toutiao.com",
"Connection":"keep-alive",
"Accept":"text/javascript, text/html, application/xml, text/xml, */*",
"X-Requested-With":"XMLHttpRequest",
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
"Content-Type":"application/x-www-form-urlencoded",
"Referer":"http://www.toutiao.com/ch/news_hot/",
"Accept-Encoding":"gzip, deflate",
"Accept-Language":"zh-CN,zh;q=0.8", } result1 = session.get(url=url,params=data,headers=headers).text
result2 =json.loads(result1)
if result2["message1"] =="success": for i in result2["data"]:
source_url =i["source_url"] headers = { "Host":"www.toutiao.com",
"Connection":"keep-alive",
"Cache-Control":"max-age=0",
"Upgrade-Insecure-Requests":"",
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding":"gzip, deflate",
"Accept-Language":"zh-CN,zh;q=0.8",
}
url1 = "http://www.toutiao.com" + str(source_url)
try: return_data = session.get(url=url1, headers=headers).content
except:
pass
# print return_data
try:
contentData = re.findall(' <article>(.*?)</article>',return_data)[0]
except:
contentData = "" cx = sqlite3.connect("C:\\Users\\xuchunlin\\PycharmProjects\\study\\db.sqlite3",check_same_thread=False)
cx.text_factory = str try:
print "正在插入链接 %s 数据" % (url) chinese_ta = i["chinese_tag"]
media_avatar_url = i["media_avatar_url"]
is_feed_ad = i["is_feed_ad"]
tag_url = i["tag_url"]
title = i["title"]
tag = i["tag"]
label = str(i["label"])
abstract = i["abstract"]
source_url = i["source_url"] print title
print chinese_ta
print media_avatar_url
print is_feed_ad
print tag_url
print tag
print label
print abstract
print source_url url2 = md5(str(url1)) cx.execute("INSERT INTO toutiao (title,chinese_ta,media_avatar_url,is_feed_ad,tag_url,tag,label,abstract,source_url,url,contentData)VALUES (?,?,?,?,?,?,?,?,?,?,?)",
(str(title), str(chinese_ta), str(media_avatar_url), str(is_feed_ad), str(tag_url), str(tag), str(label), str(abstract), str(source_url), str(url2),str(contentData)))
cx.commit() # time.sleep(2)
except Exception as e:
print e
print "cha ru shi bai " cx.close() else:
print "请求失败" return list_data print jinri()

爬虫本身很简单,难的是自己去分析网页、解析网页,以及提升爬虫的效率