/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Directory = Lucene.Net.Store.Directory;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using RAMOutputStream = Lucene.Net.Store.RAMOutputStream;

namespace Lucene.Net.Index
{
    
    /// <summary>The SegmentMerger class combines two or more Segments, represented by an
    /// IndexReader ({@link #add}), into a single Segment.  After adding the appropriate
    /// readers, call the merge method to combine the segments.
    /// <p>
    /// If the compoundFile flag is set, then the segments will be merged into a compound file.
    /// </summary>
    /// <seealso cref="Merge"/>
    /// <seealso cref="Add"/>
    sealed public class SegmentMerger
    {
        private void InitBlock()
        {
            termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
        }
        
        private Directory directory;
        private System.String segment;
        private int termIndexInterval;
        
        private System.Collections.ArrayList readers = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
        private FieldInfos fieldInfos;
        
        // File extensions of old-style index files
        private static readonly System.String[] COMPOUND_EXTENSIONS = new System.String[] {"fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"};
        private static readonly System.String[] VECTOR_EXTENSIONS = new System.String[] {"tvx", "tvd", "tvf"};
        
        /// <summary>This ctor used only by test code.</summary>
        /// <param name="dir">The Directory to merge the other segments into</param>
        /// <param name="name">The name of the new segment</param>
        public /*internal*/ SegmentMerger(Directory dir, System.String name)
        {
            InitBlock();
            directory = dir;
            segment = name;
        }
        
        internal SegmentMerger(IndexWriter writer, System.String name)
        {
            InitBlock();
            directory = writer.GetDirectory();
            segment = name;
            termIndexInterval = writer.GetTermIndexInterval();
        }
        
        /// <summary>Add an IndexReader to the collection of readers that are to be merged</summary>
        /// <param name="reader">the reader to add</param>
        public /*internal*/ void Add(IndexReader reader)
        {
            readers.Add(reader);
        }
        
        /// <param name="i">The index of the reader to return</param>
        /// <returns>The ith reader to be merged</returns>
        internal IndexReader SegmentReader(int i)
        {
            return (IndexReader) readers[i];
        }
        
        /// <summary>Merges the readers specified by the {@link #add} method
        /// into the directory passed to the constructor.</summary>
        /// <returns>The number of documents that were merged</returns>
        /// <throws>IOException</throws>
        public /*internal*/ int Merge()
        {
            int value_Renamed;
            
            value_Renamed = MergeFields();
            MergeTerms();
            MergeNorms();
            
            if (fieldInfos.HasVectors())
                MergeVectors();
            
            return value_Renamed;
        }
        
        /// <summary>Close all IndexReaders that have been added.
        /// Should not be called before merge().</summary>
        /// <throws>IOException</throws>
        public /*internal*/ void CloseReaders()
        {
            for (int i = 0; i < readers.Count; i++)
            {
                // close readers
                IndexReader reader = (IndexReader) readers[i];
                reader.Close();
            }
        }
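        
        // ------------------------------------------------------------------
        // Illustrative sketch (not from the original source): a typical merge
        // sequence, assuming "directory" is an open Lucene.Net.Store.Directory
        // and readerA/readerB are hypothetical open IndexReaders over the
        // segments being merged.  CreateCompoundFile is internal, so the last
        // step only applies to callers inside this assembly.
        //
        //     SegmentMerger merger = new SegmentMerger(directory, "_merged");
        //     merger.Add(readerA);
        //     merger.Add(readerB);
        //     int docCount = merger.Merge();            // write the new segment
        //     merger.CloseReaders();                    // only after Merge()
        //     merger.CreateCompoundFile("_merged.cfs"); // optional .cfs packing
        // ------------------------------------------------------------------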
        
        internal System.Collections.ArrayList CreateCompoundFile(System.String fileName)
        {
            CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, fileName);
            System.Collections.ArrayList files = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(COMPOUND_EXTENSIONS.Length + fieldInfos.Size()));
            
            // Basic files
            for (int i = 0; i < COMPOUND_EXTENSIONS.Length; i++)
            {
                files.Add(segment + "." + COMPOUND_EXTENSIONS[i]);
            }
            
            // Field norm files
            for (int i = 0; i < fieldInfos.Size(); i++)
            {
                FieldInfo fi = fieldInfos.FieldInfo(i);
                if (fi.isIndexed)
                {
                    files.Add(segment + ".f" + i);
                }
            }
            
            // Vector files
            if (fieldInfos.HasVectors())
            {
                for (int i = 0; i < VECTOR_EXTENSIONS.Length; i++)
                {
                    files.Add(segment + "." + VECTOR_EXTENSIONS[i]);
                }
            }
            
            // Now merge all added files
            System.Collections.IEnumerator it = files.GetEnumerator();
            while (it.MoveNext())
            {
                cfsWriter.AddFile((System.String) it.Current);
            }
            
            // Perform the merge
            cfsWriter.Close();
            
            return files;
        }
        
        /// <returns>The number of documents in all of the readers</returns>
        /// <throws>IOException</throws>
        private int MergeFields()
        {
            fieldInfos = new FieldInfos(); // merge Field names
            int docCount = 0;
            for (int i = 0; i < readers.Count; i++)
            {
                IndexReader reader = (IndexReader) readers[i];
                fieldInfos.AddIndexed(reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true);
                fieldInfos.AddIndexed(reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false);
                fieldInfos.AddIndexed(reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true);
                fieldInfos.AddIndexed(reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false);
                fieldInfos.AddIndexed(reader.GetFieldNames(IndexReader.FieldOption.INDEXED), false, false, false);
                fieldInfos.Add(reader.GetFieldNames(IndexReader.FieldOption.UNINDEXED), false);
            }
            fieldInfos.Write(directory, segment + ".fnm");
            
            FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
            try
            {
                for (int i = 0; i < readers.Count; i++)
                {
                    IndexReader reader = (IndexReader) readers[i];
                    int maxDoc = reader.MaxDoc();
                    for (int j = 0; j < maxDoc; j++)
                        if (!reader.IsDeleted(j))
                        {
                            // skip deleted docs
                            fieldsWriter.AddDocument(reader.Document(j));
                            docCount++;
                        }
                }
            }
            finally
            {
                fieldsWriter.Close();
            }
            return docCount;
        }
        
        /// <summary>Merge the TermVectors from each of the segments into the new one.</summary>
        /// <throws>IOException</throws>
        private void MergeVectors()
        {
            TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos);
            
            try
            {
                for (int r = 0; r < readers.Count; r++)
                {
                    IndexReader reader = (IndexReader) readers[r];
                    int maxDoc = reader.MaxDoc();
                    for (int docNum = 0; docNum < maxDoc; docNum++)
                    {
                        // skip deleted docs
                        if (reader.IsDeleted(docNum))
                            continue;
                        termVectorsWriter.AddAllDocVectors(reader.GetTermFreqVectors(docNum));
                    }
                }
            }
            finally
            {
                termVectorsWriter.Close();
            }
        }
        
        private IndexOutput freqOutput = null;
        private IndexOutput proxOutput = null;
        private TermInfosWriter termInfosWriter = null;
        private int skipInterval;
        private SegmentMergeQueue queue = null;
        
        private void MergeTerms()
        {
            try
            {
                freqOutput = directory.CreateOutput(segment + ".frq");
                proxOutput = directory.CreateOutput(segment + ".prx");
                termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
                skipInterval = termInfosWriter.skipInterval;
                queue = new SegmentMergeQueue(readers.Count);
                
                MergeTermInfos();
            }
            finally
            {
                if (freqOutput != null)
                    freqOutput.Close();
                if (proxOutput != null)
                    proxOutput.Close();
                if (termInfosWriter != null)
                    termInfosWriter.Close();
                if (queue != null)
                    queue.Close();
            }
        }
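        
        // ------------------------------------------------------------------
        // Worked example (not from the original source) of the doc-base
        // bookkeeping in MergeTermInfos below: base_Renamed is advanced by
        // reader.NumDocs(), the count of non-deleted documents.  With three
        // readers holding 10, 7 and 5 live documents, the bases are 0, 10 and
        // 17, so (after docMap compacts away deletions) document 3 of the
        // third reader becomes document 17 + 3 = 20 in the merged segment.
        // ------------------------------------------------------------------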
        
        private void MergeTermInfos()
        {
            int base_Renamed = 0;
            for (int i = 0; i < readers.Count; i++)
            {
                IndexReader reader = (IndexReader) readers[i];
                TermEnum termEnum = reader.Terms();
                SegmentMergeInfo smi = new SegmentMergeInfo(base_Renamed, termEnum, reader);
                base_Renamed += reader.NumDocs();
                if (smi.Next())
                    queue.Put(smi); // initialize queue
                else
                    smi.Close();
            }
            
            SegmentMergeInfo[] match = new SegmentMergeInfo[readers.Count];
            
            while (queue.Size() > 0)
            {
                int matchSize = 0; // pop matching terms
                match[matchSize++] = (SegmentMergeInfo) queue.Pop();
                Term term = match[0].term;
                SegmentMergeInfo top = (SegmentMergeInfo) queue.Top();
                
                while (top != null && term.CompareTo(top.term) == 0)
                {
                    match[matchSize++] = (SegmentMergeInfo) queue.Pop();
                    top = (SegmentMergeInfo) queue.Top();
                }
                
                MergeTermInfo(match, matchSize); // add new TermInfo
                
                while (matchSize > 0)
                {
                    SegmentMergeInfo smi = match[--matchSize];
                    if (smi.Next())
                        queue.Put(smi); // restore queue
                    else
                        smi.Close(); // done with a segment
                }
            }
        }
        
        private TermInfo termInfo = new TermInfo(); // minimize consing
        
        /// <summary>Merge one term found in one or more segments. The array <code>smis</code>
        /// contains segments that are positioned at the same term. <code>n</code>
        /// is the number of cells in the array actually occupied.</summary>
        /// <param name="smis">array of segments</param>
        /// <param name="n">number of cells in the array actually occupied</param>
        private void MergeTermInfo(SegmentMergeInfo[] smis, int n)
        {
            long freqPointer = freqOutput.GetFilePointer();
            long proxPointer = proxOutput.GetFilePointer();
            
            int df = AppendPostings(smis, n); // append posting data
            
            long skipPointer = WriteSkip();
            
            if (df > 0)
            {
                // add an entry to the dictionary with pointers to prox and freq files
                termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
                termInfosWriter.Add(smis[0].term, termInfo);
            }
        }
        
        /// <summary>Process postings from multiple segments all positioned on the
        /// same term. Writes out merged entries into freqOutput and
        /// the proxOutput streams.</summary>
        /// <param name="smis">array of segments</param>
        /// <param name="n">number of cells in the array actually occupied</param>
        /// <returns>number of documents across all segments where this term was found</returns>
        private int AppendPostings(SegmentMergeInfo[] smis, int n)
        {
            int lastDoc = 0;
            int df = 0; // number of docs w/ term
            ResetSkip();
            for (int i = 0; i < n; i++)
            {
                SegmentMergeInfo smi = smis[i];
                TermPositions postings = smi.postings;
                int base_Renamed = smi.base_Renamed;
                int[] docMap = smi.docMap;
                postings.Seek(smi.termEnum);
                while (postings.Next())
                {
                    int doc = postings.Doc();
                    if (docMap != null)
                        doc = docMap[doc]; // map around deletions
                    doc += base_Renamed; // convert to merged space
                    
                    if (doc < lastDoc)
                        throw new System.SystemException("docs out of order");
                    
                    df++;
                    
                    if ((df % skipInterval) == 0)
                    {
                        BufferSkip(lastDoc);
                    }
                    
                    int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
                    lastDoc = doc;
                    
                    int freq = postings.Freq();
                    if (freq == 1)
                    {
                        freqOutput.WriteVInt(docCode | 1); // write doc & freq=1
                    }
                    else
                    {
                        freqOutput.WriteVInt(docCode); // write doc
                        freqOutput.WriteVInt(freq); // write frequency in doc
                    }
                    
                    int lastPosition = 0; // write position deltas
                    for (int j = 0; j < freq; j++)
                    {
                        int position = postings.NextPosition();
                        proxOutput.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                }
            }
            return df;
        }
        
        private RAMOutputStream skipBuffer = new RAMOutputStream();
        private int lastSkipDoc;
        private long lastSkipFreqPointer;
        private long lastSkipProxPointer;
        
        private void ResetSkip()
        {
            skipBuffer.Reset();
            lastSkipDoc = 0;
            lastSkipFreqPointer = freqOutput.GetFilePointer();
            lastSkipProxPointer = proxOutput.GetFilePointer();
        }
        
        private void BufferSkip(int doc)
        {
            long freqPointer = freqOutput.GetFilePointer();
            long proxPointer = proxOutput.GetFilePointer();
            
            skipBuffer.WriteVInt(doc - lastSkipDoc);
            skipBuffer.WriteVInt((int) (freqPointer - lastSkipFreqPointer));
            skipBuffer.WriteVInt((int) (proxPointer - lastSkipProxPointer));
            
            lastSkipDoc = doc;
            lastSkipFreqPointer = freqPointer;
            lastSkipProxPointer = proxPointer;
        }
        
        private long WriteSkip()
        {
            long skipPointer = freqOutput.GetFilePointer();
            skipBuffer.WriteTo(freqOutput);
            return skipPointer;
        }
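        
        // ------------------------------------------------------------------
        // Worked example (not from the original source) of the .frq encoding
        // produced by AppendPostings above: the doc delta is shifted left one
        // bit and the low bit flags freq == 1.
        //
        //     delta = 3, freq = 1  ->  one VInt:  (3 << 1) | 1 = 7
        //     delta = 3, freq = 5  ->  two VInts: (3 << 1) = 6, then 5
        //
        // Positions then go to .prx as deltas against the previous position
        // in the same document.
        // ------------------------------------------------------------------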
        
        private void MergeNorms()
        {
            for (int i = 0; i < fieldInfos.Size(); i++)
            {
                FieldInfo fi = fieldInfos.FieldInfo(i);
                if (fi.isIndexed)
                {
                    IndexOutput output = directory.CreateOutput(segment + ".f" + i);
                    try
                    {
                        for (int j = 0; j < readers.Count; j++)
                        {
                            IndexReader reader = (IndexReader) readers[j];
                            int maxDoc = reader.MaxDoc();
                            byte[] input = new byte[maxDoc];
                            reader.Norms(fi.name, input, 0);
                            for (int k = 0; k < maxDoc; k++)
                            {
                                if (!reader.IsDeleted(k))
                                {
                                    output.WriteByte(input[k]);
                                }
                            }
                        }
                    }
                    finally
                    {
                        output.Close();
                    }
                }
            }
        }
    }
}
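
// Illustrative note (not from the original source): MergeNorms writes one
// plain file per indexed field, named "<segment>.f<fieldNumber>", holding one
// norm byte per non-deleted document in reader order; CreateCompoundFile
// lists these same ".f" files for later packing into the compound file.
// A caller inside the assembly could read one back roughly like this
// (hypothetical sketch, assuming field 0 is indexed):
//
//     Lucene.Net.Store.IndexInput norms = directory.OpenInput(segment + ".f0");
//     byte b = norms.ReadByte(); // norm for merged doc 0 of field 0
//     norms.Close();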