Lucene 3.0.2 Search

Date: 2023-03-08 20:45:03

1. Lucene term frequency

Reposted from: http://mxdxm.iteye.com/blog/989031

Lucene in Action, true to the "in Action" series, stays practical throughout: the book devotes much of its space to query parsing, result tuning, and Lucene applications, which makes it a good fit for anyone building full-text search. But Lucene is useful well beyond search engines. If I had not recently come across an article on using Lucene for term-frequency and document statistics, I might still be struggling to find a tool suited to research work. In fact, Lucene easily covers the requirements raised in an information-retrieval course, for example:

* Statistics: implement the following functions *

(1) count the document frequency (DF) of a term in the whole collection;

(2) count the total occurrences of a term in the whole collection (collection-wide term frequency; a sketch for this one follows the list);

(3) count the frequency of a term within a given document (term frequency, TF);

(4) list the positions at which a term occurs within a given document;

(5) count the number of documents in the whole collection.
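Item (2) is the only one the listing below does not print directly. Here is a minimal sketch of it (my addition, not from the original post; the CollectionTermFreq class name and the "index" path are placeholders): the collection-wide count of a term is just the sum of its per-document frequencies from a TermDocs enumeration.

package lia.meetlucene;

import java.io.File;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.store.FSDirectory;

// Placeholder sketch: collection-wide term frequency = sum of per-document freq() values.
public class CollectionTermFreq {
    public static void main(String[] args) throws Exception {
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("index")));
        TermDocs docs = reader.termDocs(new Term("contents", "人民币"));
        long total = 0;
        while (docs.next()) {
            total += docs.freq(); // occurrences of the term within one document
        }
        System.out.println("collection-wide frequency = " + total);
        reader.close();
    }
}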

Another reference: http://www.360doc.com/content/11/0427/03/1947337_112596569.shtml

package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searchnum {
    // static final Log log = LogFactory.getLog(Statistic.class);

    public static void printIndex(IndexReader reader) throws Exception {
        /*
        // Print the document count
        System.out.println(new Date() + "\n");
        System.out.println(reader + "\tThis index contains " + reader.numDocs() + " documents\n");
        for (int i = 0; i < reader.numDocs(); i++) {
            System.out.println("Document " + i + ": " + reader.document(i) + "\n");
        }
        */
        // Enumerate every term; for each, print <document, term freq, position*>
        TermEnum termEnum = reader.terms();
        while (termEnum.next()) {
            System.out.println("\nterm in field " + termEnum.term().field() + ": "
                    + termEnum.term().text());
            System.out.println("  document frequency = " + termEnum.docFreq());
            TermPositions termPositions = reader.termPositions(termEnum.term());
            int i = 0;
            while (termPositions.next()) {
                System.out.println("\n" + (i++) + "-> doc id: "
                        + termPositions.doc() + ", occurrences: "
                        + termPositions.freq() + ", positions:");
                for (int j = 0; j < termPositions.freq(); j++)
                    System.out.println("[" + termPositions.nextPosition() + "]");
                System.out.println("\n");
            }
        }
    }

    /*
    public static void main(String[] args) throws Exception {
        // String index = ReadConfig.getPara("indexdir");
        IndexReader reader = IndexReader.open(index);
        printIndex(reader);
    }
    */

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            throw new IllegalArgumentException("Usage: java "
                    + Searcher.class.getName() + " <index dir> <query>");
        }
        String indexDir = args[0];        // 1: index directory
        // String q = args[1];            // 2: query string (not used here)
        Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
        IndexSearcher search = new IndexSearcher(dir);
        IndexReader reader = search.getIndexReader();
        printIndex(reader);
    }
}

Output:

term in field contents: 精神
  document frequency =
0-> doc id: , occurrences: , positions:
[]

term in field contents: 繁荣
  document frequency =
0-> doc id: , occurrences: , positions:
[]
[]
[]

term in field contents: 给予
  document frequency =
0-> doc id: , occurrences: , positions:
[]

Reference: http://hanyuanbo.iteye.com/blog/812847

2. Counting a term's occurrences with Lucene (results depend on the analyzer used at index time)

package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searchnumber {

    public static void main(String[] args) throws CorruptIndexException,
            IOException {
        String indexDir = args[0]; // 1: index directory
        String q = args[1];        // 2: query keyword
        search(indexDir, q);
    }

    public static void search(String indexDir, String keyword) {
        try {
            Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
            IndexSearcher is = new IndexSearcher(dir, true);
            IndexReader reader = is.getIndexReader();
            // Print every document in the index
            int num = reader.numDocs();
            for (int i = 0; i < num; i++) {
                Document doc = reader.document(i);
                System.out.println(doc);
            }
            // For each document containing the term, print its id and frequency
            Term term = new Term("contents", keyword);
            TermDocs docs = reader.termDocs(term);
            while (docs.next()) {
                // System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
                System.out.print("doc num\t" + docs.doc() + "\t");
                System.out.println("frequency:\t" + docs.freq());
            }
            reader.close();
            is.close();
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

Output:

Document<stored,indexed<filename:commentbyme.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\commentbyme.txt>>
Document<stored,indexed<filename:gettrendweek.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\gettrendweek.txt>>
Document<stored,indexed<filename:no.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\no.txt>>
Document<stored,indexed<filename:showuser.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\showuser.txt>>
Document<stored,indexed<filename:suggestionusermayinst.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\suggestionusermayinst.txt>>
doc num 0 frequency: 15
doc num 2 frequency: 2
doc num 3 frequency: 1
doc num 4 frequency: 30
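As the heading warns, the result depends on the analyzer: a TermDocs lookup is exact and bypasses analysis, so the keyword must be a token the index-time analyzer actually produced. A small sketch of checking that (my addition; the ShowTokens class name is a placeholder), printing what StandardAnalyzer would emit for a given string, with the same token-attribute API that section 7 uses:

package lia.meetlucene;

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

// Placeholder utility: print the tokens the analyzer would index for a string,
// so you can see which exact terms a TermDocs lookup must use.
public class ShowTokens {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        TokenStream ts = analyzer.tokenStream("contents", new StringReader(args[0]));
        TermAttribute attr = ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.print(attr.term() + " ");
        }
        System.out.println();
    }
}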

3. Counting how many documents contain a keyword with Lucene

package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Searchnumbers {

    public static void main(String[] args) throws CorruptIndexException,
            IOException {
        String indexDir = args[0]; // 1: index directory
        String q = args[1];        // 2: query string to parse
        search(indexDir, q);
    }

    public static void search(String indexDir, String keyword) {
        try {
            Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
            IndexSearcher is = new IndexSearcher(dir, true);
            QueryParser parser = new QueryParser(Version.LUCENE_30, "contents",
                    new StandardAnalyzer(Version.LUCENE_30)); // which Field of the Documents the QueryParser targets
            Query query = parser.parse(keyword);
            TopScoreDocCollector collector = TopScoreDocCollector.create(100, false);
            long start = new Date().getTime();
            is.search(query, collector); // run the query; results land in the TopScoreDocCollector
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            System.out.println(hits.length);
            for (int i = 0; i < hits.length; i++) {
                Document doc = is.doc(hits[i].doc);
                System.out.println(doc.getField("filename") + "\t"
                        + hits[i].toString()); // print each hit's filename Field
            }
            is.close();
            long end = new Date().getTime();
            System.out.println("Found " + collector.getTotalHits()
                    + " document(s) (in " + (end - start)
                    + " milliseconds) that matched query '" + keyword + "'");
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}
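If all you need is the count itself, collecting 100 ScoreDocs is unnecessary: TopDocs.totalHits carries the full match count no matter how few hits you request. A count-only sketch (my addition; the CountHits class name is a placeholder):

package lia.meetlucene;

import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

// Placeholder utility: print only the number of documents matching a query.
public class CountHits {
    public static void main(String[] args) throws Exception {
        IndexSearcher is = new IndexSearcher(FSDirectory.open(new File(args[0])), true);
        QueryParser parser = new QueryParser(Version.LUCENE_30, "contents",
                new StandardAnalyzer(Version.LUCENE_30));
        Query query = parser.parse(args[1]);
        TopDocs top = is.search(query, 1); // ask for one hit; totalHits still counts all matches
        System.out.println("matching documents: " + top.totalHits);
        is.close();
    }
}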

4. Counting occurrences of a keyword among the indexed terms (tokenized words). (Same as 2.)

package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searchnumber {

    public static void main(String[] args) throws CorruptIndexException,
            IOException {
        String indexDir = args[0]; // 1: index directory
        String q = args[1];        // 2: query keyword
        search(indexDir, q);
    }

    public static void search(String indexDir, String keyword) {
        try {
            Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
            IndexSearcher is = new IndexSearcher(dir, true);
            IndexReader reader = is.getIndexReader();
            int num = reader.numDocs();
            for (int i = 0; i < num; i++) {
                Document doc = reader.document(i);
                System.out.println(doc);
            }
            Term term = new Term("contents", keyword);
            TermDocs docs = reader.termDocs(term);
            while (docs.next()) {
                System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
                System.out.print("doc num\t" + docs.doc() + "\t");
                System.out.println("frequency:\t" + docs.freq());
            }
            reader.close();
            is.close();
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

Output:
Document<stored,indexed<filename:texthz.txt> stored,indexed<fullpath:E:\xdj\weibodata\text\texthz.txt>>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
doc num 0 frequency: 27254

5. Searching documents for a keyword and printing the matching file names

package lia.meetlucene;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions.
 */

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;

// From chapter 1
/**
 * This code was originally written for Erik's Lucene intro java.net article
 */
public class Searcher {

    public static void main(String[] args) throws IllegalArgumentException,
            IOException, ParseException {
        if (args.length != 2) {
            throw new IllegalArgumentException("Usage: java "
                    + Searcher.class.getName() + " <index dir> <query>");
        }
        String indexDir = args[0]; // 1: index directory
        String q = args[1];        // 2: query string to parse
        search(indexDir, q);
    }

    public static void search(String indexDir, String q) throws IOException,
            ParseException {
        Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
        IndexSearcher is = new IndexSearcher(dir);
        IndexReader reader = is.getIndexReader();
        /*
        QueryParser parser = new QueryParser(Version.LUCENE_30, // 4: parse the query string
                "contents",
                new StandardAnalyzer(Version.LUCENE_30));
        */
        QueryParser parser = new QueryParser(Version.LUCENE_30, // 4: parse the query string
                "contents",
                new SmartChineseAnalyzer(Version.LUCENE_30));
        Query query = parser.parse(q);
        long start = System.currentTimeMillis();
        TopDocs hits = is.search(query, 10); // 5: search the index
        long end = System.currentTimeMillis();
        System.err.println("Found " + hits.totalHits +          // 6: report search stats
                " document(s) (in " + (end - start) +
                " milliseconds) that matched query '" + q + "':");
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = is.doc(scoreDoc.doc); // 7: retrieve the matching document
            System.out.println(doc.get("fullpath") + " " + scoreDoc.doc); // 8: show the file name
        }
        is.close(); // 9: close the IndexSearcher
    }
}
/*
 * #1 Parse provided index directory #2 Parse provided query string #3 Open
 * index #4 Parse query #5 Search index #6 Write search stats
 *
 * #7 Retrieve matching document #8 Display filename #9 Close IndexSearcher
 */

Output:
Found 1 document(s) (in 16 milliseconds) that matched query '雨钝':
E:\xdj\weibodata\text\texthz.txt 0
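The commented-out reader.docFreq hint in the code above deserves spelling out: when the question is only "how many documents contain this exact term", no QueryParser and no search are needed at all. A sketch (my addition; the DocFreq class name is a placeholder, and the term must already be in analyzed form):

package lia.meetlucene;

import java.io.File;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

// Placeholder utility: document frequency of an exact term, no search involved.
public class DocFreq {
    public static void main(String[] args) throws Exception {
        IndexReader reader = IndexReader.open(FSDirectory.open(new File(args[0])));
        int df = reader.docFreq(new Term("contents", args[1]));
        System.out.println("document frequency of '" + args[1] + "' = " + df);
        reader.close();
    }
}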

6. Sorting search results for output

package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

public class Searchnumbers {
    /**
     * Build the index.<br>
     * Four Documents in all, each with two Fields, text and size: text holds the content, size is used for sorting.
     *
     * @throws CorruptIndexException
     * @throws LockObtainFailedException
     * @throws IOException
     */
    private static void build() throws CorruptIndexException, LockObtainFailedException, IOException {
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File("index")), new StandardAnalyzer(Version.LUCENE_30), true, MaxFieldLength.LIMITED);
        Document document = new Document();
        document.add(new Field("text", "google", Store.YES, Index.ANALYZED));
        document.add(new Field("size", "1", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        writer.addDocument(document);
        document = new Document();
        document.add(new Field("text", "google earth apache", Store.YES, Index.ANALYZED));
        document.add(new Field("size", "2", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        writer.addDocument(document);
        document = new Document();
        document.add(new Field("text", "baidu earth", Store.YES, Index.ANALYZED));
        document.add(new Field("size", "3", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        writer.addDocument(document);
        document = new Document();
        document.add(new Field("text", "baidu earth apache", Store.YES, Index.ANALYZED));
        document.add(new Field("size", "4", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        writer.addDocument(document);
        writer.optimize();
        writer.close();
    }

    /**
     * Lucene 3.0 no longer has search methods that return Hits; search with the methods that return TopDocs instead.
     *
     * @param keyword
     *            the keyword to search for
     * @throws CorruptIndexException
     * @throws IOException
     * @throws ParseException
     */
    private static void searchWithTopDocs(String keyword) throws CorruptIndexException, IOException, ParseException {
        QueryParser parser = new QueryParser(Version.LUCENE_30, "text", new StandardAnalyzer(Version.LUCENE_30));
        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
        TopDocs topDocs = searcher.search(parser.parse(keyword), 10); // take the top 10 results, or all of them if there are fewer
        ScoreDoc[] scoreDocs = topDocs.scoreDocs; // the ScoreDoc array
        System.out.println("hits:" + topDocs.totalHits);
        for (ScoreDoc scoreDoc : scoreDocs) {
            int docNum = scoreDoc.doc; // document number
            Document doc = searcher.doc(docNum);
            String text = doc.get("text");
            String size = doc.get("size");
            float score = scoreDoc.score; // relevance score
            System.out.println(text + " " + size + " " + score);
        }
    }

    /**
     * Search with the hits sorted; this likewise returns TopFieldDocs rather than Hits.
     *
     * @param keyword
     *            the keyword to search for
     * @throws CorruptIndexException
     * @throws IOException
     * @throws ParseException
     */
    private static void searchWithSort(String keyword) throws CorruptIndexException, IOException, ParseException {
        QueryParser parser = new QueryParser(Version.LUCENE_30, "text", new StandardAnalyzer(Version.LUCENE_30));
        Searcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
        Query query = parser.parse(keyword);
        SortField sortField = new SortField("size", SortField.INT, true); // the field to sort on
        TopFieldDocs topFieldDocs = searcher.search(query, null, 10, new Sort(sortField)); // the second argument is a Filter; none is needed in this example
        ScoreDoc[] socDocs = topFieldDocs.scoreDocs;
        System.out.println("hits:" + topFieldDocs.totalHits);
        for (ScoreDoc scoreDoc : socDocs) {
            int docNum = scoreDoc.doc;
            Document doc = searcher.doc(docNum);
            String text = doc.get("text");
            String size = doc.get("size");
            float score = scoreDoc.score; // the score; unusable here, since with field sorting every value is NaN
            System.out.println(text + " " + size + " " + score);
        }
    }

    public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException, ParseException {
        build();
        String keyword = "google";
        searchWithTopDocs(keyword);
        System.out.println("---------");
        searchWithSort(keyword);
    }
}
hits:2
google 1 1.287682
google earth apache 2 0.643841
---------
hits:2
google earth apache 2 NaN
google 1 NaN
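The NaN scores in the sorted run are expected: when sorting by a field, Lucene 3.0 skips score computation by default. If you want scores together with the sort, IndexSearcher.setDefaultFieldSortScoring can be switched on before searching. A sketch of an extra method for the class above (my addition, reusing the same index and fields; the method name is a placeholder):

    private static void searchWithSortAndScores(String keyword) throws CorruptIndexException, IOException, ParseException {
        QueryParser parser = new QueryParser(Version.LUCENE_30, "text", new StandardAnalyzer(Version.LUCENE_30));
        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
        searcher.setDefaultFieldSortScoring(true, true); // track per-hit scores and the max score
        Sort sort = new Sort(new SortField("size", SortField.INT, true));
        TopFieldDocs topFieldDocs = searcher.search(parser.parse(keyword), null, 10, sort);
        for (ScoreDoc scoreDoc : topFieldDocs.scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            System.out.println(doc.get("text") + " " + doc.get("size") + " " + scoreDoc.score); // score is now a real value, not NaN
        }
        searcher.close();
    }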

7. Highlighting keywords with Lucene

package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

public class Searchnum {
    /**
     * Lucene 3.0 abandons the old tokenization API in favor of the attribute-based one.<br>
     * Using SmartChineseAnalyzer as the example, this method shows how to tokenize text and read back the resulting terms.
     *
     * @throws Exception
     */
    public static void analysis() throws Exception {
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_30);
        String string = "中国人民银行采取了一系列措施防止人民币升值,但是很遗憾,这些措施在今天看来其作用是微乎其微的。难道真的就没有什么别的措施防止人民币再次疯狂升值了吗?";
        StringReader reader = new StringReader(string);
        TokenStream ts = analyzer.tokenStream("", reader);
        TermAttribute termAttribute = ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.print(termAttribute.term() + " ");
        }
        System.out.println();
    }

    /**
     * Build the index.<br>
     * The IndexWriter constructor now requires a Directory argument.
     *
     * @throws CorruptIndexException
     * @throws LockObtainFailedException
     * @throws IOException
     */
    private static void build() throws CorruptIndexException, LockObtainFailedException, IOException {
        String path = "index";
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File(path)), new SmartChineseAnalyzer(Version.LUCENE_30), true, MaxFieldLength.LIMITED);
        Document document = new Document();
        document.add(new Field("text", "中国人民银行采取了一系列措施防止人民币升值,但是很遗憾,这些措施在今天看来其作用是微乎其微的。难道真的就没有什么别的措施防止人民币再次疯狂升值了吗?", Store.YES, Index.ANALYZED));
        writer.addDocument(document);
        writer.optimize();
        writer.close();
    }

    /**
     * Searching likewise no longer offers methods that return Hits.
     *
     * @param keyword
     * @throws CorruptIndexException
     * @throws IOException
     * @throws ParseException
     * @throws InvalidTokenOffsetsException
     */
    private static void search(String keyword) throws CorruptIndexException, IOException, ParseException, InvalidTokenOffsetsException {
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_30);
        QueryParser parser = new QueryParser(Version.LUCENE_30, "text", analyzer);
        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
        Query query = parser.parse(keyword);
        System.out.println(query);
        TopDocs topDocs = searcher.search(query, 10);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        System.out.println("hits:" + topDocs.totalHits);
        for (ScoreDoc scoreDoc : scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            String text = doc.get("text");
            System.out.println(highlight(text, query, analyzer));
        }
    }

    /**
     * Highlight the keyword.
     *
     * @param content the text to highlight
     * @param query the Query object used for the search
     * @param analyzer the analyzer
     * @return the highlighted text
     * @throws IOException
     * @throws InvalidTokenOffsetsException
     */
    private static String highlight(String content, Query query, Analyzer analyzer) throws IOException, InvalidTokenOffsetsException {
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b>", "</b>");
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(25));
        String resultString = highlighter.getBestFragment(analyzer.tokenStream("", new StringReader(content)), content);
        return resultString + "...";
    }

    public static void main(String[] args) throws Exception {
        analysis();
        build();
        search("人民币");
    }
}
中国  人民  银行  采取  了  一  系列  措施  防止  人民币  升值  但是  很  遗憾  这些  措施  在  今天  看来  其  作用  是  微乎其微  的  难道  真  的  就  没有  什么  别的  措施  防止  人民币  再次  疯狂  升值  了  吗
text:人民币
hits:1
中国人民银行采取了一系列措施防止<b>人民币</b>升值,但是...
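SimpleFragmenter(25) limits the output to a single short fragment. When the keyword occurs several times, Highlighter.getBestFragments returns the best-scoring fragments joined by a separator. A variant of the highlight method above (my addition; the highlightAll name is a placeholder):

    private static String highlightAll(String content, Query query, Analyzer analyzer) throws IOException, InvalidTokenOffsetsException {
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(25));
        // up to 3 best fragments, separated by "..."
        return highlighter.getBestFragments(analyzer.tokenStream("", new StringReader(content)), content, 3, "...");
    }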

8. Printing the 1000 most frequent terms after tokenization

package lia.meetlucene;

import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searchnumbers {
    // static final Log log = LogFactory.getLog(Statistic.class);

    public static void printIndex(IndexReader reader) throws Exception {
        // Enumerate every term in the index and record a <term, freq> pair for each
        TermEnum termEnum = reader.terms();
        List<Person_Term> listA = new ArrayList<Person_Term>();
        while (termEnum.next()) {
            Person_Term pa = new Person_Term();
            pa.setterm(termEnum.term().text());
            TermPositions termPositions = reader.termPositions(termEnum.term());
            // Only the first posting is read, so freq is the term's frequency in
            // the first document that contains it (sufficient for a single-document index)
            termPositions.next();
            pa.setfreq(termPositions.freq());
            listA.add(pa);
        }
        // Sort by frequency, descending
        Collections.sort(listA, new Comparator<Person_Term>() {
            public int compare(Person_Term arg0, Person_Term arg1) {
                return arg1.getfreq().compareTo(arg0.getfreq());
            }
        });
        // Print the top 1000 terms
        int i = 0;
        for (Person_Term p : listA) {
            i++;
            System.out.println(p.getterm() + "\t" + p.getfreq());
            if (i > 1000)
                break;
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            throw new IllegalArgumentException("Usage: java "
                    + Searcher.class.getName() + " <index dir> <query>");
        }
        String indexDir = "C:/Users/Administrator/Desktop/xdj/suoyin"; // args[0]; // 1: index directory
        // String indexDir = "E:/xiaodajun/new/lia2e/src/lia/meetlucene";
        // String q = args[1]; // 2: query string
        Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
        IndexSearcher search = new IndexSearcher(dir);
        IndexReader reader = search.getIndexReader();
        printIndex(reader);
    }
}
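Note that printIndex reads only the first posting for each term, so the ranking reflects each term's frequency in the first document that contains it; that is fine for a single-document index but not in general. A sketch of the collection-wide alternative for the inner block (my addition; TermDocs would need importing from org.apache.lucene.index):

            // Instead of reading one posting, sum freq() over every document:
            TermDocs termDocs = reader.termDocs(termEnum.term());
            int total = 0;
            while (termDocs.next()) {
                total += termDocs.freq();
            }
            pa.setfreq(total);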

The code for Person_Term.java:

package lia.meetlucene;

public class Person_Term implements Comparable<Person_Term> {

    private String term;

    private Integer freq;

    /**
     * @return the term
     */
    public String getterm() {
        return term;
    }

    /**
     * @param term the term to set
     */
    public void setterm(String term) {
        this.term = term;
    }

    /**
     * @return the freq
     */
    public Integer getfreq() {
        return freq;
    }

    /**
     * @param freq the freq to set
     */
    public void setfreq(Integer freq) {
        this.freq = freq;
    }

    @Override
    public int compareTo(Person_Term arg0) {
        return this.getfreq().compareTo(arg0.getfreq());
    }
}

Output (partial):

文化    19
的 16
和 9
刘 8
中国 7
云 7
在 7
先达 3
好 3
委员 3
家中 3
就 3
C:\Users\Administrator\Desktop\xdj\weibohanzi\weibo.txt 1

9. Multi-condition search

package lia.meetlucene;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions.
 */

import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;

// From chapter 1
/**
 * This code was originally written for Erik's Lucene intro java.net article
 */
public class Searcherw {

    public static void main(String[] args) throws IllegalArgumentException,
            IOException, ParseException {
        // String indexDir = args[0]; // 1: index directory
        // String indexDir = "C:/Users/Administrator/Desktop/xdj/suoyin";
        String indexDir = "E:/xdj/tengxunsuoying";
        String q = "雨天"; // args[1]; // 2: query string to parse
        search(indexDir, q);
    }

    public static void search(String indexDir, String q) throws IOException,
            ParseException {
        Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
        IndexSearcher is = new IndexSearcher(dir);
        IndexReader reader = is.getIndexReader();
        QueryParser parser = new QueryParser(Version.LUCENE_30, // 4: parse the query string
                "context",
                // new StandardAnalyzer(Version.LUCENE_30));
                // new CJKAnalyzer(Version.LUCENE_30));
                new SmartChineseAnalyzer(Version.LUCENE_30));
        // SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        Filter filter = new RangeFilter("time", "20141001", "20141031", true, true);
        Query query = parser.parse(q);
        // query = new FilteredQuery(query, filter); // search with the filter applied
        long start = System.currentTimeMillis();
        // Sorted variant:
        // TopDocs hits = is.search(query, 10, new Sort(new SortField("time", SortField.STRING, true)));
        TopDocs hits = is.search(query, 10); // 5: search the index
        long end = System.currentTimeMillis();
        System.err.println("Found " + hits.totalHits + // 6: report search stats
                " document(s) (in " + (end - start) +
                " milliseconds) that matched query '" + q + "':");
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = is.doc(scoreDoc.doc); // 7: retrieve the matching document
            System.out.println(doc.get("time") + " " + doc.get("context")); // 8: show time and content
        }
        is.close();
    }
}

/*
 * A numeric range-query variant quoted from another example (note: it uses
 * DirectoryReader, which belongs to a later Lucene version than 3.0):
 *
public List<Document> rangeSearch() {
    List<Document> docList = new ArrayList<Document>();
    Double start = 20.0;
    Double end = 40.0;
    NumericRangeQuery rangeQuery = NumericRangeQuery.newDoubleRange("carPrice", start, end, true, true);
    try {
        directory = FSDirectory.open(new File(LuceneConstant.INDEX_PATH)); // open the index
        IndexReader reader = DirectoryReader.open(directory);
        IndexSearcher search = new IndexSearcher(reader);
        TopDocs td = search.search(rangeQuery, 10000); // collect the matching doc ids
        for (ScoreDoc doc : td.scoreDocs) {
            docList.add(search.doc(doc.doc));
        }
        reader.close();
        directory.close();
    } catch (IOException ex) {
        Logger.getLogger(LuceneDao.class.getName()).log(Level.SEVERE, null, ex);
    }
    return docList;
}
*/
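In Searcherw the date filter is constructed but never applied (the FilteredQuery line is commented out). A sketch of actually wiring it in (my addition, assuming the parser, q, and IndexSearcher is from above; note that in stock Lucene 3.0 the equivalent of RangeFilter is TermRangeFilter, with the same constructor arguments):

        // Restrict hits to the "time" range, two equivalent ways:
        Filter filter = new TermRangeFilter("time", "20141001", "20141031", true, true);
        Query query = parser.parse(q);
        // (a) hand the filter to search() directly
        TopDocs hits = is.search(query, filter, 10);
        // (b) or fold it into the query itself
        TopDocs hits2 = is.search(new FilteredQuery(query, filter), 10);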