本文通过代码简单展示了如何使用 TermQuery 和 FuzzyLikeThisQuery 进行索引查询,并展示了如何在查询结果中高亮显示匹配的关键字(这在实际使用中是一个很有用的功能)。
1 public class Indexer
2 {
3
4 /**
5 * @param args
6 * @throws IOException
7 * @throws LockObtainFailedException
8 * @throws CorruptIndexException
9 * @throws InvalidTokenOffsetsException
10 */
11 public static void main(String[] args) throws CorruptIndexException,
12 LockObtainFailedException, IOException, InvalidTokenOffsetsException
13 {
14 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
15
16 IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
17 config.setOpenMode(OpenMode.CREATE_OR_APPEND);
18
19 Directory indexDir = new RAMDirectory();
20
21 /**
22 * 1. Indexing...
23 */
24 IndexWriter writer = new IndexWriter(indexDir, config);
25
26 File docs = new File("D:\\files");
27
28 if (docs.exists() && docs.isDirectory())
29 {
30 File[] files = docs.listFiles();
31
32 if (files != null && files.length > 0)
33 {
34 for (File file : files)
35 {
36 // •Field.Index.NO 不索引,如果存储选项为YES,一般用于只保存不搜索的字段;
37 // •Field.Index.ANALYZED 分词建索引;
38 // •Field.Index.NOT_ANALYZED 建索引但不分词,字段虽然被索引但是没有任何分析器对字段进行分析,只能整词精确搜索,可保存唯一性字段(例如ID)并用于更新索引
39 Document doc = new Document();
40 doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES, Field.Index.NO));
41 doc.add(new Field("id", file.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));
42 doc.add(new Field("name", file.getName(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
43
44 doc.add(new Field("size", file.getTotalSpace() + "b", Field.Store.YES, Field.Index.NO));
45
46 writer.addDocument(doc);
47 }
48
49 writer.commit();
50 }
51 }
52
53 writer.close(true);
54
55
56 /**
57 * 2. List indexed files ...
58 */
59 IndexReader reader = IndexReader.open(indexDir);
60 IndexSearcher searcher = new IndexSearcher(reader);
61
62 System.out.println("Max doc:" + searcher.maxDoc());
63 System.out.println("List files below....");
64
65 Document doc = null;
66 for (int i = 0; i < searcher.maxDoc(); i++)
67 {
68 doc = searcher.doc(i);
69 System.out.println("Doc " + i + " Name: " + doc.get("name") + ", Path: " + doc.get("path") + ", Size: " + doc.get("size"));
70 }
71 System.out.println("===================================================================================");
72
73
74 /**
75 * 3.Searching...
76 */
77 String id = "we";
78 // 此处若改为Query queryId = new TermQuery(new Term("id", id));则无法搜索出结果,除非id = "We are young.txt";
79 Query queryId = new TermQuery(new Term("name", id));
80 TopDocs hitsForId = searcher.search(queryId, null, 100);
81 if (hitsForId != null && hitsForId.totalHits > 0)
82 {
83 System.out.println("Searched " + hitsForId.totalHits + " docs for id " + id + "...");
84
85 for (int j = 0; j < hitsForId.scoreDocs.length; j++)
86 {
87 System.out.println("Score doc for id " + j + " is " + hitsForId.scoreDocs[j].toString());
88 }
89 }
90 System.out.println("===================================================================================");
91
92 String keyword = "we are yy";
93 FuzzyLikeThisQuery fuzzyLikeThisQuery = new FuzzyLikeThisQuery(100, analyzer);
94 fuzzyLikeThisQuery.addTerms(keyword, "name", 0.8F, 0);
95
96 // FuzzyLikeThisQuery不是lucene core自带的查询类,属于contrib的query模块
97 // 默认情况下QueryScorer的私有成员WeightedSpanTermExtractor无法识别它,getBestFragment将返回null
98 // 因此此处调用rewrite生成一个WeightedSpanTermExtractor可以识别的query对象,用于匹配内容关键字
99 Query query = fuzzyLikeThisQuery.rewrite(reader);
100
101 // 高亮显示关键字,如果内容中本来就有<span></span>,可能导致显示错乱
102 SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span>", "</span>");
103 Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
104
105 TopDocs hits = searcher.search(fuzzyLikeThisQuery, null, 100);
106
107 if (hits != null && hits.totalHits > 0)
108 {
109 System.out.println("Searched " + hits.totalHits + "docs for keyword " + keyword + "...");
110
111 ScoreDoc[] sDocs = hits.scoreDocs;
112
113 Document docMatched = null;
114 for (int j = 0; j < sDocs.length; j++)
115 {
116 System.out.println("Score doc " + j + " is " + sDocs[j].toString());
117
118 docMatched = searcher.doc(sDocs[j].doc);
119
120 TokenStream tokenStream = analyzer.tokenStream("name", new StringReader(docMatched.get("name")));
121 String str = highlighter.getBestFragment(tokenStream, docMatched.get("name"));
122
123 System.out.println("Score doc " + j + " hightlight to: " + str);
124
125 }
126 }
127
128 reader.close();
129 indexDir.close();
130 }
131 }
输出如下
Max doc:13
List files below....
Doc 0 Name: ab.txt, Path: D:\files\ab.txt, Size: 104857595904b
Doc 1 Name: abc.txt, Path: D:\files\abc.txt, Size: 104857595904b
Doc 2 Name: M_1.txt, Path: D:\files\M_1.txt, Size: 104857595904b
Doc 3 Name: M_11.txt, Path: D:\files\M_11.txt, Size: 104857595904b
Doc 4 Name: We are young.txt, Path: D:\files\We are young.txt, Size: 104857595904b
Doc 5 Name: 什么是微博.txt, Path: D:\files\什么是微博.txt, Size: 104857595904b
Doc 6 Name: 喝水不忘挖井人.txt, Path: D:\files\喝水不忘挖井人.txt, Size: 104857595904b
Doc 7 Name: 天苍苍野茫茫.txt, Path: D:\files\天苍苍野茫茫.txt, Size: 104857595904b
Doc 8 Name: 怎么使用lucene.txt, Path: D:\files\怎么使用lucene.txt, Size: 104857595904b
Doc 9 Name: 神马是一种马吗.txt, Path: D:\files\神马是一种马吗.txt, Size: 104857595904b
Doc 10 Name: 苍井.txt, Path: D:\files\苍井.txt, Size: 104857595904b
Doc 11 Name: 苍白 - 副本.txt, Path: D:\files\苍白 - 副本.txt, Size: 104857595904b
Doc 12 Name: 苍白.txt, Path: D:\files\苍白.txt, Size: 104857595904b
===================================================================================
Searched 1 docs for id we...
Score doc for id 0 is doc=4 score=1.7948763 shardIndex=-1
===================================================================================
Searched 1docs for keyword we are yy...
Score doc 0 is doc=4 score=0.625 shardIndex=-1
Score doc 0 hightlight to: <span>We</span> are young.txt