本文共 12604 字,大约阅读时间需要 42 分钟。
import static org.junit.Assert.*;import java.io.File;import java.io.IOException;import java.io.InputStreamReader;import java.nio.charset.StandardCharsets;import java.nio.file.FileVisitResult;import java.nio.file.Files;import java.nio.file.LinkOption;import java.nio.file.Path;import java.nio.file.Paths;import java.nio.file.SimpleFileVisitor;import java.nio.file.StandardOpenOption;import java.nio.file.attribute.BasicFileAttributes;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.IntPoint;import org.apache.lucene.document.StringField;import org.apache.lucene.document.TextField;import org.apache.lucene.document.Field.Store;import org.apache.lucene.document.LongPoint;import org.apache.lucene.document.StoredField;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.junit.Test;/** 索引操作,这里实际索引的文件为 Java 的 JDK 中包含的源码文件 * 即 C:\Program Files\Java\jdk1.8.0_201\src.zip 解压后的文档 */public class Demo3_Indexing { private static ThreadLocalstartTime = new ThreadLocal (); @Test public void test_temp() throws Exception { String title = "String.java"; System.out.println(title.substring(0, title.indexOf(".java"))); } @Test public void test_path() throws Exception { System.out.println(Paths.get("doc", new String[0])); } private static String indexPathStr = "E:\\temp\\lucene-index"; private static Path indexPath = null; private static String docsPathStr = "E:\\temp\\lucene-docs\\src"; private static Path docsPath = null; private static File docsFile = null; private static int count = 0; private static boolean isInited_MinTileHashCodeValue = false; private static int minTitleHashCodeValue = 0; private static int maxTitleHashCodeValue = 0; private static boolean isInited_MinDocCreatedTimeValue = false; private static long minDocCreatedTimeValue = 0; private static long maxDocCreatedTimeValue = 0; private static void updateMinOrMaxTitleHashCodeValue(int hashCode) { if (hashCode > maxTitleHashCodeValue) { maxTitleHashCodeValue = hashCode; } if (!isInited_MinTileHashCodeValue && (isInited_MinTileHashCodeValue = true) == true) { minTitleHashCodeValue = hashCode; } if (hashCode < minTitleHashCodeValue) { minTitleHashCodeValue = hashCode; } } private static void updateMinOrMaxDocCreatedTimeValue(long createdTime) { if (createdTime > maxDocCreatedTimeValue) { maxDocCreatedTimeValue = createdTime; } if (!isInited_MinDocCreatedTimeValue && (isInited_MinDocCreatedTimeValue = true) == true) { minDocCreatedTimeValue = createdTime; } if (createdTime < minDocCreatedTimeValue) { minDocCreatedTimeValue = createdTime; } } static { docsPath = Paths.get(docsPathStr, new String[0]); if (Files.notExists(docsPath, new LinkOption[] { LinkOption.NOFOLLOW_LINKS})) { System.out.println("指定文件目录不存在,docsPath:" + docsPathStr); System.out.println("程序退出"); System.exit(1); } docsFile = docsPath.toFile(); // init index path indexPath = Paths.get(indexPathStr, new String[0]); if (Files.notExists(indexPath, new LinkOption[] { LinkOption.NOFOLLOW_LINKS})) { System.out.println("(不要慌,Lucene 会为我们创建的)指定索引文件不存在,indexPath:" + indexPathStr); } } public static void recurveFiles(File file) { if (file.isDirectory()) { File[] files = file.listFiles(); for(File f : files) { recurveFiles(f); } }else { count++; } } public static void walkPaths(final IndexWriter writer, Path path){ try { if (Files.isDirectory(path, new LinkOption[] { LinkOption.NOFOLLOW_LINKS})) { Files.walkFileTree(path, new SimpleFileVisitor () { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { // 跳过目录 if (!Files.isDirectory(file, new LinkOption[] { LinkOption.NOFOLLOW_LINKS})) { count++; doIndexing(writer, file); } return FileVisitResult.CONTINUE; } @Override public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException { return FileVisitResult.CONTINUE; } }); }else { // path 为一个文件,而非目录 count++; doIndexing(writer, path); } } catch (IOException e) { e.printStackTrace(); } finally { try { writer.close(); } catch (IOException e) { e.printStackTrace(); } } } public static void doIndexing(IndexWriter writer, Path filePath) { try { long docCreatedTime = System.currentTimeMillis(); Document document = new Document(); // field 1, 'title' - StringField // stringField,使用原始字符串值作为一个 token(词汇单元), // 在查询时,使用完全匹配,且区分大小写, // 例如,原始字符串为 ‘String’,则在查询时,使用 ‘string’则匹配不到该文本(Document), // 必须使用 ‘String’ 即和原始字符串值完全一致时,才能匹配该文本(Document) String title = filePath.getFileName().toString(); String titleNameWithoutSuffix = null; // 对 Java 源文件特殊对待 if (title.endsWith("java")) { titleNameWithoutSuffix = title.substring(0, title.indexOf(".java")); } if (titleNameWithoutSuffix != null) { // 1// document.add(new StringField("title", titleNameWithoutSuffix, Field.Store.YES)); // 2 document.add(new TextField("title", titleNameWithoutSuffix, Field.Store.YES)); }else { // 1// document.add(new StringField("title", title, Field.Store.YES)); // 2 document.add(new TextField("title", title, Field.Store.YES)); } // field 2, 'createdTime' - LongPoint document.add(new LongPoint("createdTime", new long[] { docCreatedTime})); document.add(new StoredField("createdTimeValue", docCreatedTime)); updateMinOrMaxDocCreatedTimeValue(docCreatedTime); // field 3, 'body' - TextField document.add(new TextField("body", new InputStreamReader(Files.newInputStream(filePath, StandardOpenOption.READ) , StandardCharsets.UTF_8))); document.add(new StoredField("bodyValue", Files.readAllBytes(filePath))); // field 4, hashcode of title - IntPoint int titleHashCode = (titleNameWithoutSuffix == null ? title.hashCode() : titleNameWithoutSuffix.hashCode()); document.add(new IntPoint("titleHashCode", new int[] { titleHashCode})); document.add(new StoredField("titleHashCodeValue", titleHashCode)); updateMinOrMaxTitleHashCodeValue(titleHashCode); // field 5, path document.add(new StoredField("path", filePath.toString())); writer.addDocument(document); } catch (IOException e) { e.printStackTrace(); } } public static void main(String[] args) throws IOException { System.out.println("beginning..."); count = 0; long st = System.currentTimeMillis();// recurveFiles(docsFile); long et = System.currentTimeMillis();// System.out.println("recurve file cost time " + (et - st) + "ms, count is " + count); count = 0; st = System.currentTimeMillis(); // Directory directory = FSDirectory.open(indexPath); Analyzer analyzer = new StandardAnalyzer(); IndexWriterConfig config = new IndexWriterConfig(analyzer); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter writer = new IndexWriter(directory, config); // walkPaths(writer, docsPath); et = System.currentTimeMillis(); System.out.println("indxing file cost time " + (et - st) + "ms, count is " + count); System.out.println("title hash code range is from " + minTitleHashCodeValue + " to " + maxTitleHashCodeValue); System.out.println("document created time range is from " + minDocCreatedTimeValue + " to " + maxDocCreatedTimeValue); } }
import java.io.ByteArrayInputStream;import java.io.IOException;import java.io.InputStreamReader;import java.nio.charset.StandardCharsets;import java.nio.file.Files;import java.nio.file.LinkOption;import java.nio.file.Path;import java.nio.file.Paths;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.IntPoint;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.Term;import org.apache.lucene.queryparser.classic.ParseException;import org.apache.lucene.queryparser.classic.QueryParser;import org.apache.lucene.queryparser.xml.builders.PointRangeQueryBuilder;import org.apache.lucene.search.BooleanClause.Occur;import org.apache.lucene.search.BooleanQuery;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.PhraseQuery;import org.apache.lucene.search.PointRangeQuery;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TermQuery;import org.apache.lucene.search.TopDocs;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;/** 搜索索引 */public class Demo3_Searching { private static String indexPathStr = "E:\\temp\\lucene-index"; private static Path indexPath = null; static { // init index path indexPath = Paths.get(indexPathStr, new String[0]); if (Files.notExists(indexPath, new LinkOption[] { LinkOption.NOFOLLOW_LINKS})) { System.out.println("指定索引文件不存在,无法执行搜索,indexPath:" + indexPathStr); System.out.println("程序退出"); System.exit(1); } } /** 解析字节流,返回其字符串表示 */ public static String getStringValueFromBytes(byte[] bytes) { StringBuilder builder = new StringBuilder(); if (bytes != null && bytes.length > 0) { InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(bytes) , StandardCharsets.UTF_8); try { char[] buf = new char[1024]; int res = 0; while((res = reader.read(buf)) != -1) { builder.append(buf, 0, res); } } catch (IOException e) { e.printStackTrace(); } } return builder.toString(); } /** 执行查询操作 */ public static void doSearching(IndexSearcher searcher, Query query, int histNum) { try { long startTime = System.currentTimeMillis(); TopDocs topDocs = searcher.search(query, histNum); System.out.println("搜索耗时:" + (System.currentTimeMillis() - startTime) + "毫秒"); if (topDocs != null) { System.out.println("搜索结果为: \n--------------------"); ScoreDoc[] scoreDocs = topDocs.scoreDocs; for(ScoreDoc hit : scoreDocs) { Document doc = searcher.doc(hit.doc); // available field: title , createdTime / createdTimeValue // , body , titleHashCode / titleHashCodeValue String title = doc.get("title"); String path = doc.get("path"); Field titleHashCodeValue = (Field) doc.getField("titleHashCodeValue"); Field bodyValue = (Field) doc.getField("bodyValue"); String bodyValueStr = getStringValueFromBytes(bodyValue.binaryValue().bytes); System.out.println("score=" + hit.score + ", title : " + title + ", hashCod=" + titleHashCodeValue.stringValue() + ", \npath : " + path); // 输出部分 body 值(因为 body 部分文本太长) System.out.println("[body]\n" + bodyValueStr.substring(0, (bodyValueStr.length() > 10 ? 10 : bodyValueStr.length()))); // 打印分割线 System.out.println("----------"); } } } catch (IOException e) { e.printStackTrace(); } } public static void main(String[] args) throws IOException, ParseException { Directory directory = FSDirectory.open(indexPath); IndexReader reader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new StandardAnalyzer(); int histNum = 10; // 1 该查询方式区分大小写 Query query = new TermQuery(new Term("body", "string")); // 2 查询解析器会将查询语句全部转为小写// String fieldName = "title";// QueryParser parser = new QueryParser(fieldName, analyzer);// Query query = parser.parse("String"); // 3 // phraseQuery ,使用完全匹配,并且区分大小写// Query query = new PhraseQuery("title", new String[] {"string"}); // 4 布尔查询(组合多个查询)// Query query = new BooleanQuery.Builder()// .add(new TermQuery(new Term("body", "string")), Occur.MUST)// .add(new TermQuery(new Term("body", "file")), Occur.MUST)// .build(); // 5 数值范围,精确查询// Query query = IntPoint.newRangeQuery("titleHashCode", -1808118735, -808118735); // do searching System.out.println("查询语句为:" + query); doSearching(searcher, query, histNum); }}
转载地址:http://djlsi.baihongyu.com/