1. 输入文件为
fufang_list.txt
yaofang_a aaiwan 阿艾丸
yaofang_a aaiwulingsan 阿艾五苓散
yaofang_a acaitang 阿菜汤
yaofang_a afurongjiu 阿芙蓉酒
yaofang_a aqietuoyao 阿伽陀药
yaofang_a aweichubisan 阿魏搐鼻散
yaofang_a aweigao 阿魏膏
yaofang_a aweigaoyao 阿魏膏药
yaofang_a aweihuapigao 阿魏化痞膏
yaofang_a aweihuapisan 阿魏化痞散
yaofang_a aweijikuaiwan 阿魏积块丸
yaofang_a aweileiwansan 阿魏雷丸散
yaofang_a aweilizhongwan 阿魏理中丸
yaofang_a aweiliangjiangwan 阿魏良姜丸
yaofang_a aweiruanjiansan 阿魏软坚散
yaofang_a aweisan 阿魏散
yaofang_a aweishexiangsan 阿魏麝香散
yaofang_a aweitongjingwan 阿魏通经丸
yaofang_a aweiwan 阿魏丸
yaofang_a aweiwanlinggao 阿魏万灵膏
2. 爬虫脚本
get_tcmdata.py
#!/usr/bin/python
#coding:utf8
from __future__ import print_function
import click
import urllib2
import re
from bs4 import BeautifulSoup
import sys
reload(sys)
import socket
sys.setdefaultencoding("utf8")
socket.setdefaulttimeout(20) base_url = "http://www.zysj.com.cn/zhongyaofang/{}.html"
headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} @click.command()
@click.argument('input1')
@click.option("--pos",'-pos')
def query_tcm_info(input1,pos):
"""
the script will ignore the previous pos lines
"""
zhongyaofang_list = open(input1)
pos = int(pos)
num = 0
if pos:
for i in range(0,pos):
zhongyaofang_list.readline()
num = num + pos
for zhongyaofang_info in zhongyaofang_list:
num = num +1
zhongyaofang_info_list = zhongyaofang_info.strip("\n").split("\t")
url_id = "/".join(zhongyaofang_info_list[0:2])
file_out = "_".join(zhongyaofang_info_list[0:2])
file_out_name = "_".join([file_out,str(num)])
output_file = open(file_out_name+".txt","w")
query_url = base_url.format(url_id)
req = urllib2.Request(query_url,headers = headers)
content = urllib2.urlopen(req,timeout=20).read()
soup = BeautifulSoup(content)
words = soup.getText()
output_file.write(words) if __name__ == "__main__":
query_tcm_info()
3. 运行脚本命令
python get_tcmdata.py fufang_list.txt --pos 0
4. 简单百度爬虫
#!/usr/bin/python
#coding:utf8
from __future__ import print_function
import sys
reload(sys)
sys.setdefaultencoding("utf8")
import urllib2 request = urllib2.Request(url)
request.add_data('a',"")
request.add_heder('User-Agent',"Mozilla/5.0")
response = urllib2.urlopen(request)
cont = response.read()
print(cont)
m = requests.post("http://www.megabionet.org/tcmid/ingredientsearch/?name=adonitol")
m.url