Lucene是一个高性能、可扩展的全文搜索引擎库,可以快速地处理大量文本数据。Lucene索引文件是一种特殊的文件格式,它用于存储和检索文本数据。Lucene索引文件的核心是一个叫做“索引”的数据结构,它包含了所有被索引的文本数据。
Lucene索引文件由三个部分组成:字典、倒排列表和正向列表。字典是一个单词列表,它包含了所有被索引的单词;倒排列表是一个单词-文章ID对应表,它包含了所有被索引的单词出现在哪些文章中;正向列表是一个文章ID-单词对应表,它包含了所有被索引的文章中出现过的单词。
// 字典
[word1, word2, word3, ...]
// 倒排列表
[ {word1: [docId1, docId2, ...]}, {word2: [docId3, docId4, ...]}, {word3: [docId5, docId6, ...]}, ... ]
// 正向列表
[ {docId1: [word1, word2, ...]}, {docId2: [word3, word4, ...]}, {docId3: [word5, word6, ...]}, ... ]
索引是识别文档并为搜索准备文档的过程。
下表列出了索引过程中常用的类。
类 | 描述 |
---|---|
IndexWriter | 在索引过程中创建/更新索引。 |
Directory | 表示索引的存储位置。 |
Analyzer | 分析文档并从文本中获取标记/单词。 |
Document | 带有字段(Field)的虚拟文档。分析器(Analyzer)可以处理文档。 |
Field | 索引过程的最小单位。它表示键值对,其中键用于标识被索引的值。 |
以下代码显示了如何使用Lucene索引文本文件。
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.LongField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.util.Date; public class Main { private Main() {} public static void main(String[] args) { String usage = "java IndexFiles" + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]nn" + "This indexes the documents in DOCS_PATH, creating a Lucene index" + "in INDEX_PATH that can be searched with SearchFiles"; String indexPath = "index"; String docsPath = null; boolean create = true; for(int i=0;i<args.length;i++) { if ("-index".equals(args[i])) { indexPath = args[i+1]; i++; } else if ("-docs".equals(args[i])) { docsPath = args[i+1]; i++; } else if ("-update".equals(args[i])) { create = false; } } if (docsPath == null) { System.err.println("Usage: " + usage); System.exit(1); } final File docDir = new File(docsPath); if (!docDir.exists() || !docDir.canRead()) { System.out.println("Document directory "" +docDir.getAbsolutePath()+ "" does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); try { System.out.println("Indexing to directory "" + indexPath + ""..."); Directory dir = FSDirectory.open(new File(indexPath)); // :Post-Release-Update-Version.LUCENE_XY: Analyzer 
analyzer = new StandardAnalyzer(Version.LUCENE_4_10_0); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_10_0, analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. This can be // a terribly costly operation, so generally it"s only // worth it when your index is relatively static (ie // you"re done adding documents to it): // // writer.forceMerge(1); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "n with message: " + e.getMessage()); } } static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn"t help return; } try { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. 
searchable), but don"t tokenize // the field into separate words and don"t index term frequency // or positional information: Field pathField = new StringField("path", file.getPath(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that"s not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } } finally { fis.close(); } } } } }
Java格式 -Java 打印样式格式 java.util.Formatter 类支持printf样式格式化。printf样式格式化是C编程语言的良好支持。以下代码...
Java正则表达式教程 -Java正则表达式匹配 Matcher 类对字符序列执行匹配通过解释在 Pattern 对象中定义的编译模式。 Pattern 类...
Java反射 -Java类反射我们可以使用Java反射来获取关于类的信息,例如作为其包名称,其访问修饰符等。要获得简单的类名,请使用 C...
Java线程教程 -Java线程休眠Thread类包含一个静态sleep()方法,它使线程在指定的持续时间内休眠。Thread.sleep()方法接受超时作...