/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using IndexReader = Lucene.Net.Index.IndexReader;
using Hits = Lucene.Net.Search.Hits;
using Similarity = Lucene.Net.Search.Similarity;
using Parameter = Lucene.Net.Util.Parameter;

namespace Lucene.Net.Documents
{
    /// <summary>
    /// A field is a section of a Document. Each field has two parts, a name and a
    /// value. Values may be free text, provided as a String or as a Reader, or they
    /// may be atomic keywords, which are not further processed. Such keywords may
    /// be used to represent dates, urls, etc. Fields are optionally stored in the
    /// index, so that they may be returned with hits on the document.
    /// </summary>
    [Serializable]
    public sealed class Field
    {
        private System.String name = "body";

        // The one and only data object for all the different kinds of field value
        // (String, TextReader or byte[]); the typed accessors below cast it back.
        private System.Object fieldsData = null;

        private bool storeTermVector = false;
        private bool storeOffsetWithTermVector = false;
        private bool storePositionWithTermVector = false;
        private bool isStored = false;
        private bool isIndexed = true;
        private bool isTokenized = true;
        private bool isBinary = false;
        private bool isCompressed = false;

        private float boost = 1.0f;

        /// <summary>Specifies whether and how a field's value is stored in the index.</summary>
        [Serializable]
        public sealed class Store : Parameter
        {
            internal Store(System.String name) : base(name)
            {
            }

            /// <summary>
            /// Store the original field value in the index in a compressed form. This is
            /// useful for long documents and for binary valued fields.
            /// </summary>
            public static readonly Store COMPRESS = new Store("COMPRESS");

            /// <summary>
            /// Store the original field value in the index. This is useful for short texts
            /// like a document's title which should be displayed with the results. The
            /// value is stored in its original form, i.e. no analyzer is used before it is
            /// stored.
            /// </summary>
            public static readonly Store YES = new Store("YES");

            /// <summary>Do not store the field value in the index.</summary>
            public static readonly Store NO = new Store("NO");
        }

        /// <summary>Specifies whether and how a field's value is indexed.</summary>
        [Serializable]
        public sealed class Index : Parameter
        {
            internal Index(System.String name) : base(name)
            {
            }

            /// <summary>
            /// Do not index the field value. This field can thus not be searched,
            /// but one can still access its contents provided it is
            /// <see cref="Field.Store">stored</see>.
            /// </summary>
            public static readonly Index NO = new Index("NO");

            /// <summary>
            /// Index the field's value so it can be searched. An Analyzer will be used
            /// to tokenize and possibly further normalize the text before its
            /// terms will be stored in the index. This is useful for common text.
            /// </summary>
            public static readonly Index TOKENIZED = new Index("TOKENIZED");

            /// <summary>
            /// Index the field's value without using an Analyzer, so it can be searched.
            /// As no analyzer is used the value will be stored as a single term. This is
            /// useful for unique Ids like product numbers.
            /// </summary>
            public static readonly Index UN_TOKENIZED = new Index("UN_TOKENIZED");
        }

        /// <summary>Specifies whether and how term vectors are stored for a field.</summary>
        [Serializable]
        public sealed class TermVector : Parameter
        {
            internal TermVector(System.String name) : base(name)
            {
            }

            /// <summary>Do not store term vectors.</summary>
            public static readonly TermVector NO = new TermVector("NO");

            /// <summary>
            /// Store the term vectors of each document. A term vector is a list
            /// of the document's terms and their number of occurrences in that document.
            /// </summary>
            public static readonly TermVector YES = new TermVector("YES");

            /// <summary>Store the term vector + token position information.</summary>
            /// <seealso cref="YES"/>
            public static readonly TermVector WITH_POSITIONS = new TermVector("WITH_POSITIONS");

            /// <summary>Store the term vector + token offset information.</summary>
            /// <seealso cref="YES"/>
            public static readonly TermVector WITH_OFFSETS = new TermVector("WITH_OFFSETS");

            /// <summary>Store the term vector + token position and offset information.</summary>
            /// <seealso cref="YES"/>
            /// <seealso cref="WITH_POSITIONS"/>
            /// <seealso cref="WITH_OFFSETS"/>
            public static readonly TermVector WITH_POSITIONS_OFFSETS = new TermVector("WITH_POSITIONS_OFFSETS");
        }

        /// <summary>
        /// Sets the boost factor for hits on this field. This value will be
        /// multiplied into the score of all hits on this field of this
        /// document.
        /// </summary>
        /// <remarks>
        /// The boost is multiplied by <see cref="Document.GetBoost()"/> of the document
        /// containing this field. If a document has multiple fields with the same
        /// name, all such values are multiplied together. This product is then
        /// multiplied by the value <see cref="Similarity.LengthNorm(string, int)"/>, and
        /// rounded by <see cref="Similarity.EncodeNorm(float)"/> before it is stored in the
        /// index. One should attempt to ensure that this product does not overflow
        /// the range of that encoding.
        /// </remarks>
        /// <param name="boost">The new boost factor.</param>
        public void SetBoost(float boost)
        {
            this.boost = boost;
        }

        /// <summary>Returns the boost factor for hits for this field.</summary>
        /// <remarks>
        /// <para>The default value is 1.0.</para>
        /// <para>
        /// Note: this value is not stored directly with the document in the index.
        /// Documents returned from <see cref="IndexReader.Document(int)"/> and
        /// <see cref="Hits.Doc(int)"/> may thus not have the same value present as when
        /// this field was indexed.
        /// </para>
        /// </remarks>
        /// <returns>The boost factor currently held by this instance.</returns>
        /// <seealso cref="SetBoost(float)"/>
        public float GetBoost()
        {
            return boost;
        }

        /// <summary>
        /// Constructs a String-valued Field that is not tokenized, but is indexed
        /// and stored. Useful for non-text fields, e.g. date or url.
        /// </summary>
        /// <remarks>
        /// Deprecated: use
        /// <c>new Field(name, value, Field.Store.YES, Field.Index.UN_TOKENIZED)</c> instead.
        /// </remarks>
        public static Field Keyword(System.String name, System.String value_Renamed)
        {
            return new Field(name, value_Renamed, true, true, false);
        }

        /// <summary>
        /// Constructs a String-valued Field that is not tokenized nor indexed,
        /// but is stored in the index, for return with hits.
        /// </summary>
        /// <remarks>
        /// Deprecated: use
        /// <c>new Field(name, value, Field.Store.YES, Field.Index.NO)</c> instead.
        /// </remarks>
        public static Field UnIndexed(System.String name, System.String value_Renamed)
        {
            return new Field(name, value_Renamed, true, false, false);
        }

        /// <summary>
        /// Constructs a String-valued Field that is tokenized and indexed,
        /// and is stored in the index, for return with hits. Useful for short text
        /// fields, like "title" or "subject". Term vector will not be stored for this field.
        /// </summary>
        /// <remarks>
        /// Deprecated: use
        /// <c>new Field(name, value, Field.Store.YES, Field.Index.TOKENIZED)</c> instead.
        /// </remarks>
        public static Field Text(System.String name, System.String value_Renamed)
        {
            return Text(name, value_Renamed, false);
        }

        /// <summary>
        /// Constructs a Date-valued Field that is not tokenized and is indexed,
        /// and stored in the index, for return with hits.
        /// </summary>
        /// <remarks>
        /// The date is converted to its string form with <c>DateField.DateToString</c>.
        /// Deprecated: use
        /// <c>new Field(name, value, Field.Store.YES, Field.Index.UN_TOKENIZED)</c> instead.
        /// </remarks>
        public static Field Keyword(System.String name, System.DateTime value_Renamed)
        {
            return new Field(name, DateField.DateToString(value_Renamed), true, true, false);
        }

        /// <summary>
        /// Constructs a String-valued Field that is tokenized and indexed,
        /// and is stored in the index, for return with hits. Useful for short text
        /// fields, like "title" or "subject".
        /// </summary>
        /// <remarks>
        /// Deprecated: use
        /// <c>new Field(name, value, Field.Store.YES, Field.Index.TOKENIZED, termVector)</c> instead.
        /// </remarks>
        public static Field Text(System.String name, System.String value_Renamed, bool storeTermVector)
        {
            return new Field(name, value_Renamed, true, true, true, storeTermVector);
        }

        /// <summary>
        /// Constructs a String-valued Field that is tokenized and indexed,
        /// but that is not stored in the index. Term vector will not be stored for this field.
        /// </summary>
        /// <remarks>
        /// Deprecated: use
        /// <c>new Field(name, value, Field.Store.NO, Field.Index.TOKENIZED)</c> instead.
        /// </remarks>
        public static Field UnStored(System.String name, System.String value_Renamed)
        {
            return UnStored(name, value_Renamed, false);
        }

        /// <summary>
        /// Constructs a String-valued Field that is tokenized and indexed,
        /// but that is not stored in the index.
        /// </summary>
        /// <remarks>
        /// Deprecated: use
        /// <c>new Field(name, value, Field.Store.NO, Field.Index.TOKENIZED, termVector)</c> instead.
        /// </remarks>
        public static Field UnStored(System.String name, System.String value_Renamed, bool storeTermVector)
        {
            return new Field(name, value_Renamed, false, true, true, storeTermVector);
        }

        /// <summary>
        /// Constructs a Reader-valued Field that is tokenized and indexed, but is
        /// not stored in the index verbatim. Useful for longer text fields, like
        /// "body". Term vector will not be stored for this field.
        /// </summary>
        /// <remarks>Deprecated: use <c>new Field(name, value)</c> instead.</remarks>
        public static Field Text(System.String name, System.IO.TextReader value_Renamed)
        {
            return Text(name, value_Renamed, false);
        }

        /// <summary>
        /// Constructs a Reader-valued Field that is tokenized and indexed, but is
        /// not stored in the index verbatim. Useful for longer text fields, like
        /// "body".
        /// </summary>
        /// <remarks>Deprecated: use <c>new Field(name, value, termVector)</c> instead.</remarks>
        public static Field Text(System.String name, System.IO.TextReader value_Renamed, bool storeTermVector)
        {
            Field f = new Field(name, value_Renamed);
            // Only the plain term-vector flag is overridden; the position/offset
            // flags keep the values the constructor gave them.
            f.storeTermVector = storeTermVector;
            return f;
        }

        /// <summary>
        /// Returns the name of the field as an interned string.
        /// For example "date", "title", "body", ...
        /// </summary>
        public System.String Name()
        {
            return name;
        }

        /// <summary>
        /// The value of the field as a String, or null. If null, the Reader value
        /// or binary value is used. Exactly one of StringValue(), ReaderValue(), and
        /// BinaryValue() must be set.
        /// </summary>
        public System.String StringValue()
        {
            return fieldsData as System.String;
        }

        /// <summary>
        /// The value of the field as a Reader, or null. If null, the String value
        /// or binary value is used. Exactly one of StringValue(), ReaderValue(),
        /// and BinaryValue() must be set.
        /// </summary>
        public System.IO.TextReader ReaderValue()
        {
            return fieldsData as System.IO.TextReader;
        }

        /// <summary>
        /// The value of the field in Binary, or null. If null, the Reader or
        /// String value is used. Exactly one of StringValue(), ReaderValue() and
        /// BinaryValue() must be set.
        /// </summary>
        public byte[] BinaryValue()
        {
            return fieldsData as byte[];
        }

        /// <summary>
        /// Create a field by specifying its name, value and how it will
        /// be saved in the index. Term vectors will not be stored in the index.
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="value_Renamed">The string to process</param>
        /// <param name="store">Whether <c>value</c> should be stored in the index</param>
        /// <param name="index">
        /// Whether the field should be indexed, and if so, if it should
        /// be tokenized before indexing
        /// </param>
        /// <exception cref="System.NullReferenceException">if name or value is <c>null</c></exception>
        /// <exception cref="System.ArgumentException">if the field is neither stored nor indexed</exception>
        public Field(System.String name, System.String value_Renamed, Store store, Index index)
            : this(name, value_Renamed, store, index, TermVector.NO)
        {
        }

        /// <summary>
        /// Create a field by specifying its name, value and how it will
        /// be saved in the index.
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="value_Renamed">The string to process</param>
        /// <param name="store">Whether <c>value</c> should be stored in the index</param>
        /// <param name="index">
        /// Whether the field should be indexed, and if so, if it should
        /// be tokenized before indexing
        /// </param>
        /// <param name="termVector">Whether term vector should be stored</param>
        /// <exception cref="System.NullReferenceException">if name or value is <c>null</c></exception>
        /// <exception cref="System.ArgumentException">
        /// in any of the following situations:
        /// <list type="bullet">
        /// <item><description>the field is neither stored nor indexed</description></item>
        /// <item><description>the field is not indexed but termVector is not <c>TermVector.NO</c></description></item>
        /// <item><description>store, index or termVector is not one of the predefined values</description></item>
        /// </list>
        /// </exception>
        public Field(System.String name, System.String value_Renamed, Store store, Index index, TermVector termVector)
        {
            // NullReferenceException is kept (rather than ArgumentNullException) because
            // existing callers may catch it; the two types are unrelated.
            if (name == null)
            {
                throw new System.NullReferenceException("name cannot be null");
            }
            if (value_Renamed == null)
            {
                throw new System.NullReferenceException("value cannot be null");
            }
            if (index == Index.NO && store == Store.NO)
            {
                throw new System.ArgumentException("it doesn't make sense to have a field that " + "is neither indexed nor stored");
            }
            if (index == Index.NO && termVector != TermVector.NO)
            {
                throw new System.ArgumentException("cannot store term vector information " + "for a field that is not indexed");
            }

            this.name = String.Intern(name); // field names are interned
            this.fieldsData = value_Renamed;

            // Order matters for which error surfaces first: store, then index, then term vector.
            SetStore(store);
            SetIndex(index);

            this.isBinary = false;

            SetStoreTermVector(termVector);
        }

        /// <summary>
        /// Create a tokenized and indexed field that is not stored. Term vectors will
        /// not be stored.
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="reader">The reader with the content</param>
        /// <exception cref="System.NullReferenceException">if name or reader is <c>null</c></exception>
        public Field(System.String name, System.IO.TextReader reader)
            : this(name, reader, TermVector.NO)
        {
        }

        /// <summary>
        /// Create a tokenized and indexed field that is not stored, optionally with
        /// storing term vectors.
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="reader">The reader with the content</param>
        /// <param name="termVector">Whether term vector should be stored</param>
        /// <exception cref="System.NullReferenceException">if name or reader is <c>null</c></exception>
        /// <exception cref="System.ArgumentException">if termVector is not one of the predefined values</exception>
        public Field(System.String name, System.IO.TextReader reader, TermVector termVector)
        {
            // NullReferenceException is kept for backward compatibility with existing callers.
            if (name == null)
            {
                throw new System.NullReferenceException("name cannot be null");
            }
            if (reader == null)
            {
                throw new System.NullReferenceException("reader cannot be null");
            }

            this.name = String.Intern(name); // field names are interned
            this.fieldsData = reader;

            this.isStored = false;
            this.isCompressed = false;

            this.isIndexed = true;
            this.isTokenized = true;

            this.isBinary = false;

            SetStoreTermVector(termVector);
        }

        /// <summary>
        /// Create a field by specifying all parameters except for <c>storeTermVector</c>,
        /// which is set to <c>false</c>.
        /// </summary>
        /// <remarks>
        /// Deprecated: use the (String, String, Field.Store, Field.Index) constructor instead.
        /// </remarks>
        public Field(System.String name, System.String string_Renamed, bool store, bool index, bool token)
            : this(name, string_Renamed, store, index, token, false)
        {
        }

        /// <summary>
        /// Create a stored field with binary value. Optionally the value may be compressed.
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="value_Renamed">The binary value</param>
        /// <param name="store">How <c>value</c> should be stored (compressed or not.)</param>
        /// <exception cref="System.ArgumentNullException">if name or value is <c>null</c></exception>
        /// <exception cref="System.ArgumentException">
        /// if store is <c>Store.NO</c> or is not one of the predefined values
        /// </exception>
        public Field(System.String name, byte[] value_Renamed, Store store)
        {
            // ArgumentNullException derives from ArgumentException, so callers that
            // catch ArgumentException keep working while getting a more specific type.
            if (name == null)
            {
                throw new System.ArgumentNullException("name", "name cannot be null");
            }
            if (value_Renamed == null)
            {
                throw new System.ArgumentNullException("value_Renamed", "value cannot be null");
            }
            // A binary field is never indexed, so an unstored one would hold nothing at all.
            if (store == Store.NO)
            {
                throw new System.ArgumentException("binary values can't be unstored");
            }

            this.name = String.Intern(name);
            this.fieldsData = value_Renamed;

            SetStore(store);

            this.isIndexed = false;
            this.isTokenized = false;

            this.isBinary = true;

            SetStoreTermVector(TermVector.NO);
        }

        /// <summary>
        /// Create a String-valued field from explicit boolean flags. The compressed,
        /// binary and term-vector position/offset flags are left at their defaults (false).
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="string_Renamed">The string to process</param>
        /// <param name="store">true if the field should store the string</param>
        /// <param name="index">true if the field should be indexed</param>
        /// <param name="token">true if the field should be tokenized</param>
        /// <param name="storeTermVector">true if we should store the Term Vector info</param>
        /// <remarks>
        /// Deprecated: use the (String, String, Field.Store, Field.Index, Field.TermVector)
        /// constructor instead.
        /// </remarks>
        /// <exception cref="System.NullReferenceException">if name or value is <c>null</c></exception>
        /// <exception cref="System.ArgumentException">
        /// if a term vector is requested for a field that is not indexed
        /// </exception>
        public Field(System.String name, System.String string_Renamed, bool store, bool index, bool token, bool storeTermVector)
        {
            // NullReferenceException is kept for backward compatibility with existing callers.
            if (name == null)
            {
                throw new System.NullReferenceException("name cannot be null");
            }
            if (string_Renamed == null)
            {
                throw new System.NullReferenceException("value cannot be null");
            }
            if (!index && storeTermVector)
            {
                throw new System.ArgumentException("cannot store a term vector for fields that are not indexed");
            }

            this.name = String.Intern(name); // field names are interned
            this.fieldsData = string_Renamed;
            this.isStored = store;
            this.isIndexed = index;
            this.isTokenized = token;
            this.storeTermVector = storeTermVector;
        }

        /// <summary>Translates a <see cref="Store"/> value into the stored/compressed flags.</summary>
        /// <exception cref="System.ArgumentException">if store is not one of the predefined values</exception>
        private void SetStore(Store store)
        {
            if (store == Store.YES)
            {
                this.isStored = true;
                this.isCompressed = false;
            }
            else if (store == Store.COMPRESS)
            {
                this.isStored = true;
                this.isCompressed = true;
            }
            else if (store == Store.NO)
            {
                this.isStored = false;
                this.isCompressed = false;
            }
            else
            {
                throw new System.ArgumentException("unknown store parameter " + store);
            }
        }

        /// <summary>Translates an <see cref="Index"/> value into the indexed/tokenized flags.</summary>
        /// <exception cref="System.ArgumentException">if index is not one of the predefined values</exception>
        private void SetIndex(Index index)
        {
            if (index == Index.NO)
            {
                this.isIndexed = false;
                this.isTokenized = false;
            }
            else if (index == Index.TOKENIZED)
            {
                this.isIndexed = true;
                this.isTokenized = true;
            }
            else if (index == Index.UN_TOKENIZED)
            {
                this.isIndexed = true;
                this.isTokenized = false;
            }
            else
            {
                throw new System.ArgumentException("unknown index parameter " + index);
            }
        }

        /// <summary>
        /// Translates a <see cref="TermVector"/> value into the three term-vector flags
        /// (vector, positions, offsets).
        /// </summary>
        /// <exception cref="System.ArgumentException">if termVector is not one of the predefined values</exception>
        private void SetStoreTermVector(TermVector termVector)
        {
            if (termVector == TermVector.NO)
            {
                this.storeTermVector = false;
                this.storePositionWithTermVector = false;
                this.storeOffsetWithTermVector = false;
            }
            else if (termVector == TermVector.YES)
            {
                this.storeTermVector = true;
                this.storePositionWithTermVector = false;
                this.storeOffsetWithTermVector = false;
            }
            else if (termVector == TermVector.WITH_POSITIONS)
            {
                this.storeTermVector = true;
                this.storePositionWithTermVector = true;
                this.storeOffsetWithTermVector = false;
            }
            else if (termVector == TermVector.WITH_OFFSETS)
            {
                this.storeTermVector = true;
                this.storePositionWithTermVector = false;
                this.storeOffsetWithTermVector = true;
            }
            else if (termVector == TermVector.WITH_POSITIONS_OFFSETS)
            {
                this.storeTermVector = true;
                this.storePositionWithTermVector = true;
                this.storeOffsetWithTermVector = true;
            }
            else
            {
                throw new System.ArgumentException("unknown termVector parameter " + termVector);
            }
        }

        /// <summary>
        /// True iff the value of the field is to be stored in the index for return
        /// with search hits. It is an error for this to be true if a field is
        /// Reader-valued.
        /// </summary>
        public bool IsStored()
        {
            return isStored;
        }

        /// <summary>
        /// True iff the value of the field is to be indexed, so that it may be
        /// searched on.
        /// </summary>
        public bool IsIndexed()
        {
            return isIndexed;
        }

        /// <summary>
        /// True iff the value of the field should be tokenized as text prior to
        /// indexing. Un-tokenized fields are indexed as a single word and may not be
        /// Reader-valued.
        /// </summary>
        public bool IsTokenized()
        {
            return isTokenized;
        }

        /// <summary>True if the value of the field is stored and compressed within the index.</summary>
        public bool IsCompressed()
        {
            return isCompressed;
        }

        /// <summary>
        /// True iff the term or terms used to index this field are stored as a term
        /// vector, available from <see cref="IndexReader.GetTermFreqVector(int, string)"/>.
        /// These methods do not provide access to the original content of the field,
        /// only to terms used to index it. If the original content must be
        /// preserved, use the <c>stored</c> attribute instead.
        /// </summary>
        public bool IsTermVectorStored()
        {
            return storeTermVector;
        }

        /// <summary>
        /// True iff terms are stored as term vector together with their offsets
        /// (start and end position in source text).
        /// </summary>
        public bool IsStoreOffsetWithTermVector()
        {
            return storeOffsetWithTermVector;
        }

        /// <summary>True iff terms are stored as term vector together with their token positions.</summary>
        public bool IsStorePositionWithTermVector()
        {
            return storePositionWithTermVector;
        }

        /// <summary>True iff the value of the field is stored as binary.</summary>
        public bool IsBinary()
        {
            return isBinary;
        }

        /// <summary>
        /// Prints a Field for human consumption: a comma-separated list of the flags
        /// that are set, followed by <c>&lt;name:value&gt;</c> (the value part is
        /// omitted when there is no data object).
        /// </summary>
        public override System.String ToString()
        {
            System.Text.StringBuilder result = new System.Text.StringBuilder();
            if (isStored)
            {
                result.Append("stored");
                if (isCompressed)
                {
                    result.Append("/compressed");
                }
                else
                {
                    result.Append("/uncompressed");
                }
            }
            if (isIndexed)
            {
                if (result.Length > 0)
                {
                    result.Append(",");
                }
                result.Append("indexed");
            }
            if (isTokenized)
            {
                if (result.Length > 0)
                {
                    result.Append(",");
                }
                result.Append("tokenized");
            }
            if (storeTermVector)
            {
                if (result.Length > 0)
                {
                    result.Append(",");
                }
                result.Append("termVector");
            }
            if (storeOffsetWithTermVector)
            {
                if (result.Length > 0)
                {
                    result.Append(",");
                }
                result.Append("termVectorOffsets");
            }
            if (storePositionWithTermVector)
            {
                if (result.Length > 0)
                {
                    result.Append(",");
                }
                result.Append("termVectorPosition");
            }
            if (isBinary)
            {
                if (result.Length > 0)
                {
                    result.Append(",");
                }
                result.Append("binary");
            }

            result.Append('<');
            result.Append(name);
            result.Append(':');

            if (fieldsData != null)
            {
                result.Append(fieldsData);
            }

            result.Append('>');
            return result.ToString();
        }
    }
}