Webcollector + Spring + MVC 搭建应用初探(二)

时间:2022-12-15 10:50:58

Webcollector + Spring + MVC 搭建应用初探(一)的代码内容中已经完成了基本的数据抓取

部分,但由于Webcollector的“强大”,按与前面的内容相同的抓取速率对up主信息进行

爬取会被禁掉,所以将前述depth 3 部分改用单线程,并使用Spring初探(七)中的

时间调度部分进行调度运行。

下面是代码,可以与Webcollector + Spring + MVC 搭建应用初探(一)中depth 3部分相对照:

package CrawlerGroup.crawl.AnchorCrawler;

import CrawlerGroup.crawl.JdbcManager.AnchorJDBCTemplate;
import org.apache.http.*;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import CrawlerGroup.crawl.JedisManager.RedisAPI;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import java.util.ArrayList;

import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;

import org.json.JSONObject;

/**
 * Created by admin on 2016/12/27.
 *
 * Scheduled task that, on every trigger, takes one member id ("mid") from the
 * Redis sorted set {@code Anchor:BiliBili}, requests that member's profile
 * from space.bilibili.com and stores the result through
 * {@link AnchorJDBCTemplate#create}.
 */
@Component
public class ScheduleTask {

    private JedisPool pool = RedisAPI.getPool();
    private final String anchorZSet = "Anchor:BiliBili";
    private AnchorJDBCTemplate anchorJDBCTemplate;

    /**
     * Setter used for dependency injection of the DAO.
     *
     * @param anchorJDBCTemplate DAO used to persist the fetched profile
     */
    public void setAnchorJDBCTemplate(AnchorJDBCTemplate anchorJDBCTemplate) {
        this.anchorJDBCTemplate = anchorJDBCTemplate;
    }

    /**
     * Processes at most one queued member id per invocation.
     *
     * The cron expression fires every five seconds. When the sorted set is
     * empty the method returns without doing anything. A non-200 response is
     * ignored; note that the id has already been removed from the set by then.
     *
     * @throws Exception on Redis, HTTP, JSON or JDBC failures
     */
    @Scheduled(cron="0/5 * * * * ? ")
    public void runCrawlerInstance() throws Exception {
        String mid = pollNextMid();
        if (mid == null) {
            return;
        }

        String url = "http://space.bilibili.com/ajax/member/GetInfo";
        HttpPost httppost = new HttpPost(url);
        ArrayList<BasicNameValuePair> params = new ArrayList<>();
        params.add(new BasicNameValuePair("mid", mid));
        httppost.setEntity(new UrlEncodedFormEntity(params, HTTP.UTF_8));
        httppost.setHeader("Host", "space.bilibili.com");
        httppost.setHeader("Origin", "http://space.bilibili.com");
        httppost.setHeader("Referer", String.format("http://space.bilibili.com/%s/", mid));

        DefaultHttpClient client = new DefaultHttpClient();
        try {
            HttpResponse response = client.execute(httppost);

            // Status 200 means the request was answered normally.
            if (response.getStatusLine().getStatusCode() == 200) {
                // Read the body; fall back to UTF-8 when the response does not
                // declare a charset, so non-ASCII names are not decoded as ISO-8859-1.
                String result = EntityUtils.toString(response.getEntity(), HTTP.UTF_8);
                JSONObject data = new JSONObject(result).getJSONObject("data");

                String anchor = data.getString("name");
                int i_mid = Integer.parseInt(mid);
                int fan_num = data.getInt("fans");
                int play_num = data.getInt("playNum");

                anchorJDBCTemplate.create(anchor, i_mid, play_num, fan_num);
            }
        } finally {
            // A new client is created on every run, so release its connections
            // every time, including on non-200 responses and on exceptions.
            client.getConnectionManager().shutdown();
        }
    }

    /**
     * Reads the first member of the sorted set and removes it.
     *
     * @return the member id, or {@code null} when the set is empty
     */
    private String pollNextMid() {
        Jedis jedis = pool.getResource();
        try {
            Object[] mid_array = jedis.zrange(anchorZSet, 0, 0).toArray();
            if (mid_array.length == 0) {
                return null;
            }
            String mid = (String) mid_array[0];
            jedis.zrem(anchorZSet, mid);
            return mid;
        } finally {
            // Hand the connection back to the pool; otherwise every run keeps
            // one connection checked out until the pool is exhausted.
            jedis.close();
        }
    }

}
AnchorJDBCTemplate

package CrawlerGroup.crawl.JdbcManager;

import cn.edu.hfut.dmic.webcollector.fetcher.Executor;
import org.springframework.jdbc.core.JdbcTemplate;

import javax.sql.DataSource;
import java.util.List;

/**
 * Created by admin on 2016/12/26.
 *
 * JdbcTemplate-backed implementation of {@link AnchorDAO} operating on the
 * {@code Anchor} table (columns NAME, TID, PLAYNUM, FANNUM, CHANGETIME).
 */
public class AnchorJDBCTemplate implements AnchorDAO {
    private DataSource dataSource;
    private JdbcTemplate jdbcTemplateObject;

    /**
     * Stores the data source and builds the JdbcTemplate used by every query.
     *
     * @param dataSource JDBC data source to run statements against
     */
    public void setDataSource(DataSource dataSource)
    {
        this.dataSource = dataSource;
        this.jdbcTemplateObject = new JdbcTemplate(dataSource);
    }

    /**
     * Inserts a row for the given TID, or refreshes the counters when a row
     * with that TID already exists (the stored name is left untouched then).
     *
     * @param name     display name written on insert
     * @param tid      identifier stored in the TID column
     * @param play_num value for PLAYNUM
     * @param fan_num  value for FANNUM
     */
    public void create(String name, Integer tid, Integer play_num, Integer fan_num)
    {
        int count = jdbcTemplateObject.queryForObject("select count(*) from Anchor where TID=?;", new Object[] { tid }, Integer.class);
        if (count != 0)
        {
            // Row already present: only the counters and timestamp are refreshed.
            update(tid, play_num, fan_num);
            return;
        }

        String SQL = "insert into Anchor (NAME, TID, PLAYNUM, FANNUM, CHANGETIME) values (?,?,?,?,NOW())";

        jdbcTemplateObject.update(SQL, name, tid, play_num, fan_num);
        System.out.println("Created Record Name = " + name + " Tid = " + tid);
    }

    /**
     * Loads the single row whose TID matches.
     *
     * @param tid identifier to look up
     * @return the mapped row, as produced by {@code AnchorManager}
     */
    public Anchor getAnchor(Integer tid)
    {
        String SQL = "select * from Anchor where TID = ?";
        Anchor anchor = jdbcTemplateObject.queryForObject(SQL, new Object[]{tid}, new AnchorManager());
        return anchor;
    }

    /**
     * Loads every row of the Anchor table.
     *
     * @return all rows mapped by {@code AnchorManager}
     */
    public List<Anchor> listAnchors(){
        String SQL = "select * from Anchor";
        List<Anchor> anchors = jdbcTemplateObject.query(SQL, new AnchorManager());
        return anchors;
    }

    /**
     * Deletes the row(s) with the given TID.
     *
     * @param tid identifier of the row to remove
     */
    public void delete(Integer tid)
    {
        String SQL = "delete from Anchor where TID = ?";
        jdbcTemplateObject.update(SQL, tid);
    }

    /**
     * Updates PLAYNUM and FANNUM and stamps CHANGETIME for the given TID.
     *
     * @param tid      identifier of the row to update
     * @param play_num new PLAYNUM value
     * @param fan_num  new FANNUM value
     */
    public void update(Integer tid, Integer play_num, Integer fan_num)
    {
        // Assignments in SET must be comma-separated. With AND, MySQL parses
        // "PLAYNUM = (? AND FANNUM = ? AND CHANGETIME = NOW())": PLAYNUM becomes
        // a 0/1 boolean and the other two columns are never written.
        String SQL = "update Anchor set PLAYNUM = ?, FANNUM = ?, CHANGETIME = NOW() where TID = ?";
        jdbcTemplateObject.update(SQL, play_num, fan_num, tid);
    }

}

启动代码:

package CrawlerGroup.crawl.AnchorCrawler;

import org.springframework.beans.factory.BeanFactory;
import org.springframework.context.support.ClassPathXmlApplicationContext;

/**
 * Created by ehang on 2016/12/27.
 *
 * Launcher whose only action is to build the Spring application context
 * described by the classpath resource "Beans.xml".
 */
public class Application {

    public static void main(String[] args) throws Exception {
        // Building the context is the whole job of this entry point; the
        // reference is not used afterwards.
        final String contextLocation = "Beans.xml";
        BeanFactory context = new ClassPathXmlApplicationContext(contextLocation);
    }
}

配置文件:

<?xml version="1.0" encoding="UTF-8"?>

<!-- Spring context for the scheduled crawler: declares the scheduled task bean,
     the JDBC data source and the DAO that the task writes through. -->
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:context="http://www.springframework.org/schema/context"
xmlns:task="http://www.springframework.org/schema/task"
xsi:schemaLocation="http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans-3.0.xsd
http://www.springframework.org/schema/task
http://www.springframework.org/schema/task/spring-task-3.0.xsd
http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context-3.0.xsd">

<!-- Turns on annotation-driven task execution, which the @Scheduled method
     in ScheduleTask relies on. -->
<task:annotation-driven/>

<!-- NOTE(review): the explicit AutowiredAnnotationBeanPostProcessor bean below
     looks redundant next to annotation-config / component-scan; confirm
     before removing. -->
<context:annotation-config/>
<bean class="org.springframework.beans.factory.annotation.AutowiredAnnotationBeanPostProcessor"/>
<context:component-scan base-package="CrawlerGroup.crawl.AnchorCrawler"/>


<!-- The scheduled task; its DAO is supplied through setAnchorJDBCTemplate. -->
<bean id="scheduleTask" class="CrawlerGroup.crawl.AnchorCrawler.ScheduleTask" >
<property name="anchorJDBCTemplate" ref="anchorJDBCTemplate"/>
</bean>


<!-- MySQL connection settings: database "test" on localhost:3306.
     NOTE(review): root user with an empty password; presumably
     development-only, verify before deploying anywhere else. -->
<bean id="dataSource" class = "org.springframework.jdbc.datasource.DriverManagerDataSource" >
<property name = "driverClassName" value = "com.mysql.jdbc.Driver"/>
<property name = "url" value = "jdbc:mysql://localhost:3306/test"/>
<property name = "username" value = "root"/>
<property name = "password" value = ""/>
</bean>

<!-- DAO for the Anchor table; receives the data source via setDataSource. -->
<bean id="anchorJDBCTemplate" class = "CrawlerGroup.crawl.JdbcManager.AnchorJDBCTemplate" >
<property name = "dataSource" ref = "dataSource"/>
</bean>

</beans>


自此数据准备部分结束,下面的任务主要围绕Spring Web Service进行有关页面及数据交互的部分,

参见后续的该类别文章及Spring初探系列。(Webcollector + Spring + MVC 搭建应用初探(三))