使用Python3编写抓取网页和只抓网页图片的脚本

时间:2022-10-27 15:22:23

最基本的抓取网页内容的代码实现:
#!/usr/bin/env python
 
try:
  from urllib.request import urlretrieve  # Python 3 location
except ImportError:
  from urllib import urlretrieve  # Python 2 fallback
 
def firstNonBlank(lines):
  """Return the first element of ``lines`` that is not blank.

  An element counts as blank when ``strip()`` leaves it empty.  ``None`` is
  returned when ``lines`` is empty or every element is blank.
  """
  return next((candidate for candidate in lines if candidate.strip()), None)
 
def firstLast(webpage):
  """Print the first and the last non-blank line of a saved page.

  webpage -- path of a local text file to read.

  Nothing is printed for a position that has no non-blank line (for
  example when the file is empty).
  """
  # ``with`` closes the file even if reading fails.  Undecodable bytes are
  # replaced so a page in an unexpected encoding cannot abort the script.
  with open(webpage, errors="replace") as f:
    lines = f.readlines()
  # Lines read from the file keep their own newline, hence end="".
  for line in (firstNonBlank(lines), firstNonBlank(reversed(lines))):
    if line is not None:
      print(line, end="")
 
def download(url='http://www',process=firstLast):
  """Retrieve ``url`` to a local file and pass that file's name to ``process``.

  url -- address to retrieve.
  process -- callable invoked with the local file name.  It is skipped when
  the retrieval raises IOError or yields an empty name.
  """
  try:
    localName = urlretrieve(url)[0]
  except IOError:
    # Best effort: a failed retrieval means there is nothing to process.
    return
  if localName:
    process(localName)
 
# Script entry point: run download() with its default arguments.
if __name__ == '__main__':
  download()

利用 urllib 模块,实现针对网页中图片的抓取功能:
import urllib.request
import socket
import re
import sys
import os
targetDir = r"C:\Users\elqstux\Desktop\pic"
def destFile(path, directory=None):
  """Return the local file path under which the image at ``path`` is saved.

  path -- image URL; the text after its last '/' becomes the file name
  (the whole string is used when it contains no '/').
  directory -- folder to save into; defaults to the module-level
  ``targetDir``.  The folder is created, with any missing parents, if it
  does not exist yet.
  """
  if directory is None:
    directory = targetDir
  # makedirs(exist_ok=True) also creates missing parent folders and does not
  # fail when the folder already exists.
  os.makedirs(directory, exist_ok=True)
  # rsplit, unlike rindex, does not raise when the URL contains no '/'.
  return os.path.join(directory, path.rsplit('/', 1)[-1])
 
if __name__ == "__main__":
  hostname = "http://www.douban.com"
  req = urllib.request.Request(hostname)
  # The context manager closes the HTTP response even if reading fails.
  with urllib.request.urlopen(req) as webpage:
    contentBytes = webpage.read()
  # Decode the bytes rather than calling str() on them: str(bytes) yields the
  # "b'...'" repr, in which line breaks and non-ASCII bytes become escape
  # sequences, so the pattern would be matched against the wrong text.
  content = contentBytes.decode('utf-8', errors='replace')
  # Group 1 is the whole link and group 2 its extension; set() drops
  # duplicate links.  https links are accepted as well as http ones.
  for link, _ext in set(re.findall(r'(https?:[^\s]*?(jpg|png|gif))', content)):
    print(link)
    try:
      urllib.request.urlretrieve(link, destFile(link))
    except OSError as err:
      # One broken link should not stop the remaining downloads.
      print("failed to download %s: %s" % (link, err), file=sys.stderr)

       

下面的示例只打印匹配到的图片链接及其类型,用来说明正则表达式含有两个分组时 re.findall 的返回值:
import urllib.request
import socket
import re
import sys
import os
targetDir = r"H:\pic"
def destFile(path, directory=None):
  """Return the local file path under which the image at ``path`` is saved.

  path -- image URL; the text after its last '/' becomes the file name
  (the whole string is used when it contains no '/').
  directory -- folder to save into; defaults to the module-level
  ``targetDir``.  The folder is created, with any missing parents, if it
  does not exist yet.
  """
  if directory is None:
    directory = targetDir
  # makedirs(exist_ok=True) also creates missing parent folders and does not
  # fail when the folder already exists.
  os.makedirs(directory, exist_ok=True)
  # The URL is split on '/' (not the OS separator); rsplit, unlike rindex,
  # does not raise when the URL contains no '/'.
  return os.path.join(directory, path.rsplit('/', 1)[-1])
 
if __name__ == "__main__":
  hostname = "http://www.douban.com/"
  req = urllib.request.Request(hostname)
  # The context manager closes the HTTP response even if reading fails.
  with urllib.request.urlopen(req) as webpage:
    contentBytes = webpage.read()
  # Decode the bytes rather than calling str() on them: str(bytes) yields the
  # "b'...'" repr, in which line breaks and non-ASCII bytes become escape
  # sequences, so the pattern would be matched against the wrong text.
  content = contentBytes.decode('utf-8', errors='replace')
  # The pattern contains two pairs of parentheses, hence two groups, so
  # findall returns a list of (whole link, extension) tuples; only text
  # matched inside parentheses appears in the result.  https links are
  # accepted as well as http ones.
  match = re.findall(r'(https?:[^\s]*?(jpg|png|gif))', content)
  for picname, picType in match:
    print(picname)
    print(picType)
    
 
'''''
输出:
http://img3.douban.com/pics/blank.gif
gif
http://img3.douban.com/icon/g111328-1.jpg
jpg
http://img3.douban.com/pics/blank.gif
gif
http://img3.douban.com/icon/g197523-19.jpg
jpg
http://img3.douban.com/pics/blank.gif
gif
...
'''