0. 对指定目录中的文件进行索引并执行搜索
  • Lucene 版本为 8.0.0,需要 JDK 8.0 及以上版本。
  • 注意:这里在遍历文档目录时,没有采用递归函数实现,而是使用 Files 工具类(Files.walkFileTree),其效率更高。使用递归函数时,递归深度受虚拟机方法栈深度限制;简单测试了一下,我的机器上最大递归深度约为 36631。与循环相比,递归的成本也更高。
  • 下面的索引与搜索程序的重点分别在 Field 类和 Query 类的使用。
1.1 索引程序如下:
import static org.junit.Assert.*;import java.io.File;import java.io.IOException;import java.io.InputStreamReader;import java.nio.charset.StandardCharsets;import java.nio.file.FileVisitResult;import java.nio.file.Files;import java.nio.file.LinkOption;import java.nio.file.Path;import java.nio.file.Paths;import java.nio.file.SimpleFileVisitor;import java.nio.file.StandardOpenOption;import java.nio.file.attribute.BasicFileAttributes;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.IntPoint;import org.apache.lucene.document.StringField;import org.apache.lucene.document.TextField;import org.apache.lucene.document.Field.Store;import org.apache.lucene.document.LongPoint;import org.apache.lucene.document.StoredField;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.junit.Test;/** 索引操作,这里实际索引的文件为 Java 的 JDK 中包含的源码文件  *
即 C:\Program Files\Java\jdk1.8.0_201\src.zip 解压后的文档 */public class Demo3_Indexing {
private static ThreadLocal
startTime = new ThreadLocal
(); @Test public void test_temp() throws Exception {
String title = "String.java"; System.out.println(title.substring(0, title.indexOf(".java"))); } @Test public void test_path() throws Exception {
System.out.println(Paths.get("doc", new String[0])); } private static String indexPathStr = "E:\\temp\\lucene-index"; private static Path indexPath = null; private static String docsPathStr = "E:\\temp\\lucene-docs\\src"; private static Path docsPath = null; private static File docsFile = null; private static int count = 0; private static boolean isInited_MinTileHashCodeValue = false; private static int minTitleHashCodeValue = 0; private static int maxTitleHashCodeValue = 0; private static boolean isInited_MinDocCreatedTimeValue = false; private static long minDocCreatedTimeValue = 0; private static long maxDocCreatedTimeValue = 0; private static void updateMinOrMaxTitleHashCodeValue(int hashCode) {
if (hashCode > maxTitleHashCodeValue) {
maxTitleHashCodeValue = hashCode; } if (!isInited_MinTileHashCodeValue && (isInited_MinTileHashCodeValue = true) == true) {
minTitleHashCodeValue = hashCode; } if (hashCode < minTitleHashCodeValue) {
minTitleHashCodeValue = hashCode; } } private static void updateMinOrMaxDocCreatedTimeValue(long createdTime) {
if (createdTime > maxDocCreatedTimeValue) {
maxDocCreatedTimeValue = createdTime; } if (!isInited_MinDocCreatedTimeValue && (isInited_MinDocCreatedTimeValue = true) == true) {
minDocCreatedTimeValue = createdTime; } if (createdTime < minDocCreatedTimeValue) {
minDocCreatedTimeValue = createdTime; } } static {
docsPath = Paths.get(docsPathStr, new String[0]); if (Files.notExists(docsPath, new LinkOption[] {
LinkOption.NOFOLLOW_LINKS})) {
System.out.println("指定文件目录不存在,docsPath:" + docsPathStr); System.out.println("程序退出"); System.exit(1); } docsFile = docsPath.toFile(); // init index path indexPath = Paths.get(indexPathStr, new String[0]); if (Files.notExists(indexPath, new LinkOption[] {
LinkOption.NOFOLLOW_LINKS})) {
System.out.println("(不要慌,Lucene 会为我们创建的)指定索引文件不存在,indexPath:" + indexPathStr); } } public static void recurveFiles(File file) {
if (file.isDirectory()) {
File[] files = file.listFiles(); for(File f : files) {
recurveFiles(f); } }else {
count++; } } public static void walkPaths(final IndexWriter writer, Path path){
try {
if (Files.isDirectory(path, new LinkOption[] {
LinkOption.NOFOLLOW_LINKS})) {
Files.walkFileTree(path, new SimpleFileVisitor
() {
@Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
// 跳过目录 if (!Files.isDirectory(file, new LinkOption[] {
LinkOption.NOFOLLOW_LINKS})) {
count++; doIndexing(writer, file); } return FileVisitResult.CONTINUE; } @Override public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
return FileVisitResult.CONTINUE; } }); }else {
// path 为一个文件,而非目录 count++; doIndexing(writer, path); } } catch (IOException e) {
e.printStackTrace(); } finally {
try {
writer.close(); } catch (IOException e) {
e.printStackTrace(); } } } public static void doIndexing(IndexWriter writer, Path filePath) {
try {
long docCreatedTime = System.currentTimeMillis(); Document document = new Document(); // field 1, 'title' - StringField // stringField,使用原始字符串值作为一个 token(词汇单元), // 在查询时,使用完全匹配,且区分大小写, // 例如,原始字符串为 ‘String’,则在查询时,使用 ‘string’则匹配不到该文本(Document), // 必须使用 ‘String’ 即和原始字符串值完全一致时,才能匹配该文本(Document) String title = filePath.getFileName().toString(); String titleNameWithoutSuffix = null; // 对 Java 源文件特殊对待 if (title.endsWith("java")) {
titleNameWithoutSuffix = title.substring(0, title.indexOf(".java")); } if (titleNameWithoutSuffix != null) {
// 1// document.add(new StringField("title", titleNameWithoutSuffix, Field.Store.YES)); // 2 document.add(new TextField("title", titleNameWithoutSuffix, Field.Store.YES)); }else {
// 1// document.add(new StringField("title", title, Field.Store.YES)); // 2 document.add(new TextField("title", title, Field.Store.YES)); } // field 2, 'createdTime' - LongPoint document.add(new LongPoint("createdTime", new long[] {
docCreatedTime})); document.add(new StoredField("createdTimeValue", docCreatedTime)); updateMinOrMaxDocCreatedTimeValue(docCreatedTime); // field 3, 'body' - TextField document.add(new TextField("body", new InputStreamReader(Files.newInputStream(filePath, StandardOpenOption.READ) , StandardCharsets.UTF_8))); document.add(new StoredField("bodyValue", Files.readAllBytes(filePath))); // field 4, hashcode of title - IntPoint int titleHashCode = (titleNameWithoutSuffix == null ? title.hashCode() : titleNameWithoutSuffix.hashCode()); document.add(new IntPoint("titleHashCode", new int[] {
titleHashCode})); document.add(new StoredField("titleHashCodeValue", titleHashCode)); updateMinOrMaxTitleHashCodeValue(titleHashCode); // field 5, path document.add(new StoredField("path", filePath.toString())); writer.addDocument(document); } catch (IOException e) {
e.printStackTrace(); } } public static void main(String[] args) throws IOException {
System.out.println("beginning..."); count = 0; long st = System.currentTimeMillis();// recurveFiles(docsFile); long et = System.currentTimeMillis();// System.out.println("recurve file cost time " + (et - st) + "ms, count is " + count); count = 0; st = System.currentTimeMillis(); // Directory directory = FSDirectory.open(indexPath); Analyzer analyzer = new StandardAnalyzer(); IndexWriterConfig config = new IndexWriterConfig(analyzer); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter writer = new IndexWriter(directory, config); // walkPaths(writer, docsPath); et = System.currentTimeMillis(); System.out.println("indxing file cost time " + (et - st) + "ms, count is " + count); System.out.println("title hash code range is from " + minTitleHashCodeValue + " to " + maxTitleHashCodeValue); System.out.println("document created time range is from " + minDocCreatedTimeValue + " to " + maxDocCreatedTimeValue); } }
1.2 搜索程序如下:
import java.io.ByteArrayInputStream;import java.io.IOException;import java.io.InputStreamReader;import java.nio.charset.StandardCharsets;import java.nio.file.Files;import java.nio.file.LinkOption;import java.nio.file.Path;import java.nio.file.Paths;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.IntPoint;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.Term;import org.apache.lucene.queryparser.classic.ParseException;import org.apache.lucene.queryparser.classic.QueryParser;import org.apache.lucene.queryparser.xml.builders.PointRangeQueryBuilder;import org.apache.lucene.search.BooleanClause.Occur;import org.apache.lucene.search.BooleanQuery;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.PhraseQuery;import org.apache.lucene.search.PointRangeQuery;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TermQuery;import org.apache.lucene.search.TopDocs;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;/** 搜索索引 */public class Demo3_Searching {
private static String indexPathStr = "E:\\temp\\lucene-index"; private static Path indexPath = null; static {
// init index path indexPath = Paths.get(indexPathStr, new String[0]); if (Files.notExists(indexPath, new LinkOption[] {
LinkOption.NOFOLLOW_LINKS})) {
System.out.println("指定索引文件不存在,无法执行搜索,indexPath:" + indexPathStr); System.out.println("程序退出"); System.exit(1); } } /** 解析字节流,返回其字符串表示 */ public static String getStringValueFromBytes(byte[] bytes) {
StringBuilder builder = new StringBuilder(); if (bytes != null && bytes.length > 0) {
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(bytes) , StandardCharsets.UTF_8); try {
char[] buf = new char[1024]; int res = 0; while((res = reader.read(buf)) != -1) {
builder.append(buf, 0, res); } } catch (IOException e) {
e.printStackTrace(); } } return builder.toString(); } /** 执行查询操作 */ public static void doSearching(IndexSearcher searcher, Query query, int histNum) {
try {
long startTime = System.currentTimeMillis(); TopDocs topDocs = searcher.search(query, histNum); System.out.println("搜索耗时:" + (System.currentTimeMillis() - startTime) + "毫秒"); if (topDocs != null) {
System.out.println("搜索结果为: \n--------------------"); ScoreDoc[] scoreDocs = topDocs.scoreDocs; for(ScoreDoc hit : scoreDocs) {
Document doc = searcher.doc(hit.doc); // available field: title , createdTime / createdTimeValue // , body , titleHashCode / titleHashCodeValue String title = doc.get("title"); String path = doc.get("path"); Field titleHashCodeValue = (Field) doc.getField("titleHashCodeValue"); Field bodyValue = (Field) doc.getField("bodyValue"); String bodyValueStr = getStringValueFromBytes(bodyValue.binaryValue().bytes); System.out.println("score=" + hit.score + ", title : " + title + ", hashCod=" + titleHashCodeValue.stringValue() + ", \npath : " + path); // 输出部分 body 值(因为 body 部分文本太长) System.out.println("[body]\n" + bodyValueStr.substring(0, (bodyValueStr.length() > 10 ? 10 : bodyValueStr.length()))); // 打印分割线 System.out.println("----------"); } } } catch (IOException e) {
e.printStackTrace(); } } public static void main(String[] args) throws IOException, ParseException {
Directory directory = FSDirectory.open(indexPath); IndexReader reader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new StandardAnalyzer(); int histNum = 10; // 1 该查询方式区分大小写 Query query = new TermQuery(new Term("body", "string")); // 2 查询解析器会将查询语句全部转为小写// String fieldName = "title";// QueryParser parser = new QueryParser(fieldName, analyzer);// Query query = parser.parse("String"); // 3 // phraseQuery ,使用完全匹配,并且区分大小写// Query query = new PhraseQuery("title", new String[] {"string"}); // 4 布尔查询(组合多个查询)// Query query = new BooleanQuery.Builder()// .add(new TermQuery(new Term("body", "string")), Occur.MUST)// .add(new TermQuery(new Term("body", "file")), Occur.MUST)// .build(); // 5 数值范围,精确查询// Query query = IntPoint.newRangeQuery("titleHashCode", -1808118735, -808118735); // do searching System.out.println("查询语句为:" + query); doSearching(searcher, query, histNum); }}


