全文检索解决方案(lucene工具类以及sphinx相关资料)

时间:2022-09-28 09:05:02

介绍两种全文检索的技术。

1、  lucene+ 中文分词(IK)

关于lucene的原理,在这里可以得到很好的学习。

http://www.blogjava.net/zhyiwww/archive/2006/07/07/57122.html

本帖主要贴几个关于lucene的工具类。

  • 索引建立
package com.lpm.fanger.search.base;

import java.io.File;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.beanutils.PropertyUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.commons.lang.time.DateFormatUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKSimilarity;

/**
 * Lucene (3.x) index utility class built around the IK Chinese analyzer.
 *
 * <p>Each indexed class gets its own file-system index directory under
 * {@link #INDEX_ROOT_PATH}, named after the class' simple name. Beans take
 * part in indexing by implementing the {@code LuceneEnable} interface.
 *
 * @author Lee
 * @Date 2013-8-22
 */
public class IndexUtils {

	/** Root directory holding one index sub-directory per indexed class. */
	private final static String INDEX_ROOT_PATH = "D:/lucene_index";
	/** Name of the stored keyword field carrying the bean's primary key. */
	private final static String KEYWORD_FIELD_NAME = "id";
	/** Shared IK analyzer used for all index operations. */
	private final static IKAnalyzer GLOBAL_ANALYZER = new IKAnalyzer();
	/** Pattern used to serialize java.util.Date property values. */
	private final static String FMT_DATE = "yyyyMMddHHmmssSSS";
	/** Formats primary keys as zero-padded 12-digit strings, no grouping. */
	private final static NumberFormat FMT_ID = NumberFormat.getInstance();

	static {
		FMT_ID.setGroupingUsed(false);
		FMT_ID.setMaximumFractionDigits(0);
		FMT_ID.setMaximumIntegerDigits(12);
		FMT_ID.setMinimumIntegerDigits(12);
	}

	/** Utility class: no instances. */
	private IndexUtils(){}

	/**
	 * Returns the analyzer shared by all index operations.
	 * @return the global IK analyzer
	 */
	public final static Analyzer getAnalyzer(){
		return GLOBAL_ANALYZER;
	}

	/*********************CRUD************************/

	/**
	 * Adds (indexes) a batch of objects, then optimizes the index.
	 *
	 * @param clazz target class; selects the index directory
	 * @param objs  objects to index
	 * @return number of documents successfully added
	 * @throws Exception on index or reflection failure
	 */
	public static int add(
			Class<? extends LuceneEnable> clazz,
			List<? extends LuceneEnable> objs) throws Exception{
		if (objs == null || objs.isEmpty())
			return 0;
		IndexWriter writer = getWriter(clazz);
		try {
			int count = add(writer, objs);
			writer.optimize();
			return count;
		} finally {
			// Always release the writer: it holds the index write lock.
			writer.close();
		}
	}

	/**
	 * Adds (indexes) a single document.
	 *
	 * @param doc the object to index; no-op when null
	 * @throws Exception on index or reflection failure
	 */
	public static void add(LuceneEnable doc) throws Exception{
		if (doc == null)
			return;
		IndexWriter writer = getWriter(doc.getClass());
		try {
			writer.addDocument(objectToDocument(doc));
			// Commit so the addition becomes visible to new readers.
			writer.commit();
		} finally {
			writer.close();
		}
	}

	/**
	 * Deletes the index entry whose primary key matches the given object.
	 *
	 * @param doc the object whose index entry should be removed; no-op when null
	 * @throws Exception on index failure
	 */
	public static void delete(LuceneEnable doc) throws Exception{
		if (doc == null)
			return;
		IndexWriter writer = getWriter(doc.getClass());
		try {
			// BUG FIX: the key is stored zero-padded via FMT_ID, but the original
			// deleted on String.valueOf(pk) ("42" vs "000000000042"), so the term
			// never matched and nothing was ever deleted. Also use the shared
			// field-name constant instead of the "id" literal.
			writer.deleteDocuments(new Term(KEYWORD_FIELD_NAME, FMT_ID.format(doc.getPrimeryKey())));
			writer.commit();
		} finally {
			writer.close();
		}
	}

	/**
	 * Updates an index entry (delete by primary key, then re-add).
	 *
	 * @param doc the object to re-index; no-op when null
	 * @throws Exception on index or reflection failure
	 */
	public static void update(LuceneEnable doc) throws Exception{
		if (doc == null)
			return;
		IndexWriter writer = getWriter(doc.getClass());
		try {
			// Delete first (see delete() for the FMT_ID padding fix) ...
			writer.deleteDocuments(new Term(KEYWORD_FIELD_NAME, FMT_ID.format(doc.getPrimeryKey())));
			// ... then add the fresh document ...
			writer.addDocument(objectToDocument(doc));
			// ... and commit the pair as one visible change.
			writer.commit();
		} finally {
			writer.close();
		}
	}

	/**********search**********/

	/**
	 * Searches the index and returns the matching primary keys.
	 *
	 * @param clazz    target class; selects the index directory
	 * @param query    the Lucene query to run
	 * @param maxCount maximum number of hits to examine
	 * @return distinct primary keys of the hits, or {@code null} when the
	 *         search produced no result object
	 * @throws Exception on index failure
	 */
	public static List<Long> find(
			Class<? extends LuceneEnable> clazz,
			Query query, int maxCount) throws Exception{
		IndexSearcher reader = getReader(clazz);
		try {
			TopDocs hits = reader.search(query, null, maxCount);
			if (hits == null)
				return null;
			List<Long> results = new ArrayList<Long>();
			// Number of hits actually available to read.
			int num = Math.min(hits.totalHits, maxCount);
			for (int i = 0; i < num; i++) {
				ScoreDoc scoreDoc = hits.scoreDocs[i];
				Document doc = reader.doc(scoreDoc.doc);
				// The padded key string parses back to its numeric value.
				Long primaryKey = NumberUtils.toLong(doc.get(KEYWORD_FIELD_NAME));
				// Skip invalid keys and de-duplicate.
				if (primaryKey > 0 && !results.contains(primaryKey)) {
					results.add(primaryKey);
				}
			}
			return results;
		} finally {
			reader.close();
		}
	}

	/**
	 * Searches the index and returns re-hydrated bean instances.
	 *
	 * @param clazz    target class; selects the index directory
	 * @param query    the Lucene query to run
	 * @param maxCount maximum number of hits to examine
	 * @return bean instances populated from stored fields, or {@code null}
	 *         when the search produced no result object
	 * @throws Exception on index or reflection failure
	 */
	public static List<? extends LuceneEnable> findList(
			Class<? extends LuceneEnable> clazz,
			Query query, int maxCount) throws Exception{
		IndexSearcher reader = getReader(clazz);
		List<LuceneEnable> results = new ArrayList<LuceneEnable>();
		try {
			TopDocs hits = reader.search(query, null, maxCount);
			if (hits == null) {
				return null;
			}
			int num = Math.min(hits.totalHits, maxCount);
			for (int i = 0; i < num; i++) {
				ScoreDoc scoreDoc = hits.scoreDocs[i];
				Document doc = reader.doc(scoreDoc.doc);
				// Rebuild the bean from the document's stored fields.
				LuceneEnable obj = documentToObject(clazz, doc);
				if (obj != null) {
					results.add(obj);
				}
			}
			return results;
		} finally {
			reader.close();
		}
	}

	/**
	 * Builds a boolean query from pre-built clauses.
	 *
	 * @param booleanClauses any number of clauses to combine
	 * @return the combined boolean query
	 */
	public static BooleanQuery getFullTextQuery(BooleanClause... booleanClauses){
		BooleanQuery booleanQuery = new BooleanQuery();
		for (BooleanClause booleanClause : booleanClauses){
			booleanQuery.add(booleanClause);
		}
		return booleanQuery;
	}

	/**
	 * Builds a full-text query matching {@code q} in ANY of the given fields
	 * (clauses are combined with SHOULD).
	 *
	 * @param q      the query keywords; empty/null yields an empty query
	 * @param fields the fields to search
	 * @return the full-text query (possibly empty)
	 */
	public static BooleanQuery getFullTextQuery(String q, String... fields){
		Analyzer analyzer = new IKAnalyzer();
		BooleanQuery query = new BooleanQuery();
		try {
			if (q != null && !q.equals("")){
				for (String field : fields){
					QueryParser parser = new QueryParser(Version.LUCENE_36, field, analyzer);
					query.add(parser.parse(q), Occur.SHOULD);
				}
			}
		} catch (ParseException e) {
			// Best-effort: an unparsable keyword leaves the query partial/empty.
			e.printStackTrace();
		}
		return query;
	}

	/************helper methods**************/

	/**
	 * Adds each object to the writer as a document (no commit/optimize here).
	 *
	 * @param writer open index writer
	 * @param objs   objects to index
	 * @return number of documents added
	 * @throws Exception on index or reflection failure
	 */
	protected static int add(IndexWriter writer, List<? extends LuceneEnable> objs) throws Exception{
		if (objs == null || objs.isEmpty()){
			return 0;
		}
		int count = 0;
		for (LuceneEnable obj : objs){
			Document doc = objectToDocument(obj);
			// NOTE(review): a bean returning boost 0 zeroes its scores;
			// implementations should return 1.0f as the neutral weight.
			doc.setBoost(obj.GetBoost());
			writer.addDocument(doc);
			count++;
		}
		return count;
	}

	/**
	 * Opens an index writer for the class' index directory.
	 *
	 * @param clazz target class; its simple name is the sub-directory
	 * @return a new writer (caller must close it)
	 * @throws IOException on filesystem failure
	 */
	protected static IndexWriter getWriter(Class<?> clazz) throws IOException{
		String path = INDEX_ROOT_PATH + File.separator + clazz.getSimpleName();
		Directory indexDir = FSDirectory.open(new File(path));
		return new IndexWriter(
				indexDir,
				GLOBAL_ANALYZER,
				IndexWriter.MaxFieldLength.UNLIMITED);
	}

	/**
	 * Opens an index searcher for the class' index directory.
	 *
	 * @param clazz target class; its simple name is the sub-directory
	 * @return a new searcher (caller must close it)
	 * @throws IOException on filesystem failure
	 */
	protected static IndexSearcher getReader(Class<?> clazz) throws IOException{
		String path = INDEX_ROOT_PATH + File.separator + clazz.getSimpleName();
		Directory indexDir = FSDirectory.open(new File(path));
		IndexSearcher reader = new IndexSearcher(indexDir);
		// Use IK's similarity so scoring matches the IK analysis.
		Similarity similarity = new IKSimilarity();
		reader.setSimilarity(similarity);
		return reader;
	}

	/**
	 * Converts a Document back into a bean by copying each stored value
	 * into the property of the same name.
	 *
	 * @param clazz bean class to instantiate (needs a no-arg constructor)
	 * @param doc   source document
	 * @return the populated bean
	 * @throws Exception on reflection failure
	 */
	private static LuceneEnable documentToObject(Class<? extends LuceneEnable> clazz, Document doc) throws Exception{
		LuceneEnable obj = clazz.newInstance();
		java.lang.reflect.Field[] fields = clazz.getDeclaredFields();
		for (java.lang.reflect.Field field : fields){
			String name = field.getName();
			String value = doc.get(name);
			if (name == null || name.equals("") || value == null || value.equals(""))
				continue; // nothing stored for this property
			setFieldValue(obj, name, value);
		}
		// BUG FIX: the original returned null here, so findList() always
		// produced an empty result list.
		return obj;
	}

	/**
	 * Converts a bean into a Lucene Document: keyword field for the primary
	 * key, analyzed fields for GetIndexFields(), stored-only fields for
	 * GetStoreFields(), plus any extension maps.
	 *
	 * @param obj the bean to convert
	 * @return the assembled document
	 * @throws Exception on reflection failure
	 */
	private static Document objectToDocument(LuceneEnable obj) throws Exception{
		Document doc = new Document();
		// Keyword field: zero-padded primary key, stored, not analyzed.
		doc.add(keyWord(KEYWORD_FIELD_NAME, FMT_ID.format(obj.getPrimeryKey())));
		// Analyzed (searchable) fields.
		String[] indexFields = obj.GetIndexFields();
		if (indexFields != null && indexFields.length > 0){
			for (String indexField : indexFields){
				String value = getFieldValue(obj, indexField);
				if (value != null && !value.equals("")){
					doc.add(index(indexField, value));
				}
			}
		}
		// Stored-only fields.
		String[] storeFields = obj.GetStoreFields();
		if (storeFields != null && storeFields.length > 0){
			for (String storeField : storeFields){
				String value = getFieldValue(obj, storeField);
				if (value != null && !value.equals("")){
					doc.add(keyWord(storeField, value));
				}
			}
		}
		// Extra analyzed key/value pairs.
		HashMap<String, String> extendIndex = obj.GetExtendIndexValues();
		if (extendIndex != null){
			for (String key : extendIndex.keySet()){
				doc.add(index(key, extendIndex.get(key)));
			}
		}
		// Extra stored-only key/value pairs.
		HashMap<String, String> extend = obj.GetExtendValues();
		if (extend != null){
			for (String key : extend.keySet()){
				doc.add(keyWord(key, extend.get(key)));
			}
		}
		return doc;
	}

	/**
	 * Builds a keyword field: stored, NOT analyzed.
	 *
	 * @param name  field name
	 * @param value field value
	 * @return the keyword field
	 */
	private static final Field keyWord(String name, String value){
		return new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED);
	}

	/**
	 * Builds an index field: stored AND analyzed.
	 *
	 * @param name  field name
	 * @param value field value
	 * @return the analyzed field
	 */
	private static final Field index(String name, String value){
		return new Field(name, value, Field.Store.YES, Field.Index.ANALYZED);
	}

	/**
	 * Reads a bean property as a String (Dates via FMT_DATE, other types
	 * via String.valueOf).
	 *
	 * @param obj       source bean
	 * @param fieldName property name
	 * @return the String form, or null when the property is null
	 * @throws Exception on reflection failure
	 */
	private static String getFieldValue(Object obj, String fieldName) throws Exception{
		Object fieldValue = PropertyUtils.getProperty(obj, fieldName);
		if (fieldValue == null)
			// BUG FIX: String.valueOf(null) yields the literal "null" and
			// would have been indexed; treat missing values as absent.
			return null;
		if (fieldValue instanceof String)
			return (String) fieldValue;
		if (fieldValue instanceof Date)
			return DateFormatUtils.format((Date) fieldValue, FMT_DATE);
		return String.valueOf(fieldValue);
	}

	/**
	 * Writes a String value into a bean property, converting to the
	 * property's actual type.
	 *
	 * @param obj        target bean
	 * @param fieldName  property name
	 * @param fieldValue stored String value
	 * @throws Exception on reflection failure
	 */
	private static void setFieldValue(Object obj, String fieldName, String fieldValue) throws Exception{
		// BUG FIX: PropertyUtils does no type conversion, so setting a String
		// into an Integer/Long property (e.g. "id") failed. BeanUtils converts
		// the String to the property's declared type.
		BeanUtils.setProperty(obj, fieldName, fieldValue);
	}
}

  

  • 查询
package com.lpm.fanger.search.base;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * @Intro Helper methods for building Lucene full-text queries with the
 * IK analyzer.
 * @author Lee
 * @Date 2013-8-24
 */
public class LuceneSearchUtils {
	/**
	 * Combines any number of pre-built clauses into one boolean query.
	 *
	 * @param booleanClauses clauses to combine
	 * @return the combined boolean query
	 */
	public static BooleanQuery getFullTextQuery(BooleanClause... booleanClauses){
		BooleanQuery combined = new BooleanQuery();
		for (int i = 0; i < booleanClauses.length; i++){
			combined.add(booleanClauses[i]);
		}
		return combined;
	}

	/**
	 * Builds a query that matches the keywords in ANY of the given fields
	 * (each field contributes a SHOULD clause).
	 *
	 * @param q      query keywords; null/empty yields an empty query
	 * @param fields fields to search across
	 * @return the full-text query (possibly empty)
	 */
	public static BooleanQuery getFullTextQuery(String q, String... fields){
		Analyzer analyzer = new IKAnalyzer();
		BooleanQuery query = new BooleanQuery();
		if (q == null || q.equals("")){
			return query;
		}
		try {
			for (String field : fields){
				QueryParser fieldParser = new QueryParser(Version.LUCENE_36, field, analyzer);
				query.add(fieldParser.parse(q), Occur.SHOULD);
			}
		} catch (ParseException e) {
			// Best-effort: an unparsable keyword leaves the query partial/empty.
			e.printStackTrace();
		}
		return query;
	}
}

  

  • 使用

一般在项目中单独开一个端口,不断的更新索引。

/**
	 * Builds the index for every record of the given class, reading from the
	 * DAO in batches of {@code BATCH_COUNT} and tracking the last seen key.
	 *
	 * @param objClass the bean class to index
	 * @return total number of documents indexed
	 */
	private static int _BuildIndexOfObject(Class<? extends LuceneEnable> objClass) throws Exception {
		int indexed = 0;
		long lastId = 0L;
		while (true) {
			List<? extends LuceneEnable> batch = dao.listAfter(lastId, BATCH_COUNT);
			boolean emptyBatch = (batch == null || batch.isEmpty());
			if (!emptyBatch) {
				indexed += IndexUtils.add(objClass, batch);
				// Remember the last primary key so the next batch continues after it.
				lastId = batch.get(batch.size() - 1).getPrimeryKey();
			}
			// A short (or empty) batch means the table is exhausted.
			if (emptyBatch || batch.size() < BATCH_COUNT) {
				break;
			}
		}
		return indexed;
	}

  

  • 测试
private final static Log log = LogFactory.getLog(RebuildLuceneIndex.class);
	// Number of records fetched from the DAO per indexing batch.
	private final static int BATCH_COUNT = 500;

	// Swap in the DAO for whichever bean type is being (re)indexed.
//	static BookDao dao;
//	static AticleDao dao;
	static ExampleDao dao;

	static {
		// Bootstrap Spring once at class-load time and look up the DAO bean.
		ApplicationContext app = new ClassPathXmlApplicationContext("spring.xml");
//		dao = app.getBean("bookDao", BookDao.class);
//		dao = app.getBean("aticleDao", AticleDao.class);
		dao = app.getBean("exampleDao", ExampleDao.class);
	}

	@SuppressWarnings({ "rawtypes", "unchecked" })
	public static void main(String[] args) throws Exception {

		// Pick the bean class to search (Book/Aticle alternatives kept as comments).
		String beanName = Example.class.getName();//Book.class.getName();//Aticle.class.getName();//
		Class beanClass = Class.forName(beanName);
		// Uncomment to (re)build the index before searching.
//		Long t1 = System.currentTimeMillis();
//		int ic = _BuildIndexOfObject(beanClass);
//		log.info(ic + " documents of " + beanName + " created.");
//		System.out.println("TIME:"+(System.currentTimeMillis() - t1)+"ms");

		Long t2 = System.currentTimeMillis();
		// Build the query; IKQueryParser is used here, the commented lines
		// show the LuceneSearchUtils alternative.
		Query query =// LuceneSearchUtils.getFullTextQuery("神奇校车", new String[]{"bookName"});//,"outline"}
		IKQueryParser.parseMultiField(new String[]{"title"}, "选择");//tested: this method works somewhat better
		//LuceneSearchUtils.getFullTextQuery("java", new String[]{"book_name","out_line"});//IKQueryParser.parseMultiField(new String[]{"title","content"}, "c++");

		// Run the search and print hit count plus elapsed time.
		List<Long> list = IndexUtils.find(beanClass, query, 100);//LuceneIndexUtils.find(beanClass, query, 100);
		//List<Aticle> list = (List<Aticle>) LuceneIndexUtils.find(beanClass, query, 100, false);
		//List<Book> list = (List<Book>) LuceneIndexUtils.find(beanClass, query, 100, false);
		System.out.println(list.size());
		System.out.println("TIME:"+(System.currentTimeMillis() - t2)+"ms");
		System.exit(0);
	}

  

  • 相关的bean
package com.lpm.fanger.search.base;

import java.util.HashMap;

import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;
import javax.persistence.Table;

/**
 * @Intro Example searchable bean mapped to table t_article; implements
 * LuceneEnable so IndexUtils can index and re-hydrate it.
 * @author Lee
 * @Date 2013-8-24
 */
@Table(name="t_article")
public class Example implements LuceneEnable{

	private Integer id;      // primary key
	private String title;    // analyzed/indexed field
	private String content;  // analyzed/indexed field
	private String tag;      // stored-only field

	/************getter and setter**************/
	@Id
	@GeneratedValue(strategy=GenerationType.IDENTITY)
	public Integer getId() {
		return id;
	}

	public void setId(Integer id) {
		this.id = id;
	}

	public String getTitle() {
		return title;
	}

	public void setTitle(String title) {
		this.title = title;
	}

	public String getContent() {
		return content;
	}

	public void setContent(String content) {
		this.content = content;
	}

	public String getTag() {
		return tag;
	}

	public void setTag(String tag) {
		this.tag = tag;
	}

	/************override method**************/
	@Override
	public Long getPrimeryKey() {
		// NOTE(review): throws NPE if id has not been set yet — callers are
		// expected to index only persisted (id-bearing) records.
		return Long.valueOf(this.getId());
	}

	@Override
	public String[] GetStoreFields() {
		return new String[]{"tag"};
	}

	@Override
	public String[] GetIndexFields() {
		return new String[]{"title","content"};
	}

	@Override
	public HashMap<String, String> GetExtendValues() {
		return null;
	}

	@Override
	public HashMap<String, String> GetExtendIndexValues() {
		return null;
	}

	@Override
	public float GetBoost() {
		// BUG FIX: returning 0 zeroed every document's score at indexing time
		// (IndexUtils calls doc.setBoost(GetBoost())); Lucene's neutral boost
		// is 1.0f.
		return 1.0f;
	}

}

  

  • 相关的接口(重要)
package com.lpm.fanger.search.base;

import java.util.HashMap;
import java.util.List;

/**
 * @Intro Beans that want to participate in Lucene full-text search
 * must implement this interface.
 * @author Lee
 * @Date 2013-8-24
 */
public interface LuceneEnable {
	/**
	 * Returns the primary key of the searchable object. A search yields
	 * these keys, so the complete record can then be loaded back from the
	 * database table.
	 * @return the record's primary key
	 */
	public Long getPrimeryKey();

	/**
	 * Returns the field names to store without analysis, e.g. createTime, author.
	 * @return stored-only field names
	 */
	public String[] GetStoreFields();

	/**
	 * Returns the field names to analyze and index, e.g. title, content.
	 * @return indexed field names
	 */
	public String[] GetIndexFields();

	/**
	 * Returns extra stored key/value pairs for the document.
	 * @return extension values, or null when there are none
	 */
	public HashMap<String, String> GetExtendValues();

	/**
	 * Returns extra analyzed/indexed key/value pairs for the document.
	 * @return extension index values, or null when there are none
	 */
	public HashMap<String, String> GetExtendIndexValues();

	/**
	 * Returns the document boost (weight) applied at indexing time.
	 * @return the boost factor (1.0f is neutral)
	 */
	public float GetBoost();

}

  

  • 相关的dao
package com.lpm.fanger.jdbc.dao;

import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.springframework.jdbc.core.RowMapper;
import org.springframework.stereotype.Repository;

import com.lpm.fanger.jdbc.mysql.BaseDaoMysqlImpl;
import com.lpm.fanger.search.base.Example;

/**
 * @Intro DB access for Example records.
 * @author Lee
 * @Date 2013-8-26
 */
@Repository("exampleDao")
public class ExampleDao extends BaseDaoMysqlImpl<Example, Integer>{
	public ExampleDao(){
		super(Example.class);
	}

	/**
	 * Lists up to {@code count} records whose primary key is greater than
	 * {@code begain}, in ascending id order (keyset pagination).
	 *
	 * @param begain last primary key already processed (exclusive)
	 * @param count  maximum number of rows to return
	 * @return the next batch of records
	 */
	public List<Example> listAfter(Long begain,Integer count){
		List<Object> values = new ArrayList<Object>();
		values.add(begain);
		values.add(count);
		// BUG FIX: "limit ?,?" treated begain as a ROW OFFSET, but callers
		// (e.g. the index builder) pass the last seen PRIMARY KEY, causing
		// skipped and repeated rows. Use a keyset predicate instead.
		String sql = "select * from "+getTableName()+" where id > ? order by id limit ?";
		List<Example> list = search(sql, values,new ExampleRowMappere());
		return list;
	}
}

/**
 * Maps one t_article result-set row onto an Example bean.
 */
class ExampleRowMappere implements RowMapper<Example>{

	@Override
	public Example mapRow(ResultSet rs, int rowNum) throws SQLException {
		Example row = new Example();
		row.setId(rs.getInt("id"));
		row.setTitle(rs.getString("title"));
		row.setContent(rs.getString("content"));
		row.setTag(rs.getString("tag"));
		return row;
	}

}

  

2、  mysql + sphinx

这种技术架构有很好的性能,主要的工作交给了插件sphinx。

相关资料:包括原理,实例以及安装,查询语句的书写等等。

http://pan.baidu.com/share/link?shareid=152940799&uk=572544164

感谢书写这些文档的前辈以及大牛们。如有侵权,请您给我留言,我会把这个链接拿掉。