// HtmlAgilityPack V1.0 - Simon Mourier /* Copyright (C) 2003 Simon Mourier All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ using System; using System.IO; using System.Text; using System.Diagnostics; using System.Collections; using System.Text.RegularExpressions; using System.Xml; using System.Xml.XPath; // Legend: SLIM=Comment added describing changes to original HtmlAgilityPack // to reduce memory consumption // Once the parser is free of bugs, the comments will be taken out namespace HtmlAgilityPack { ///

/// <summary>
/// Represents the type of parsing error.
/// </summary>
public enum HtmlParseErrorCode
{
    /// <summary>
    /// A tag was not closed.
    /// </summary>
    TagNotClosed = 0,

    /// <summary>
    /// A tag was not opened.
    /// </summary>
    TagNotOpened = 1,

    /// <summary>
    /// There is a charset mismatch between stream and declared (META) encoding.
    /// </summary>
    CharsetMismatch = 2,

    /// <summary>
    /// An end tag was not required.
    /// </summary>
    EndTagNotRequired = 3,

    /// <summary>
    /// An end tag is invalid at this position.
    /// </summary>
    EndTagInvalidHere = 4
}
///

/// <summary>
/// Represents a parsing error found during document parsing.
/// </summary>
public class HtmlParseError
{
    private readonly HtmlParseErrorCode _code;
    private readonly int _line;
    private readonly int _linePosition;
    private readonly int _streamPosition;
    private readonly string _sourceText;
    private readonly string _reason;

    internal HtmlParseError(
        HtmlParseErrorCode code,
        int line,
        int linePosition,
        int streamPosition,
        string sourceText,
        string reason)
    {
        _code = code;
        _line = line;
        _linePosition = linePosition;
        _streamPosition = streamPosition;
        _sourceText = sourceText;
        _reason = reason;
    }

    /// <summary>
    /// Gets the type of error.
    /// </summary>
    public HtmlParseErrorCode Code
    {
        get { return _code; }
    }

    /// <summary>
    /// Gets the line number of this error in the document.
    /// </summary>
    public int Line
    {
        get { return _line; }
    }

    /// <summary>
    /// Gets the column number of this error in the document.
    /// </summary>
    public int LinePosition
    {
        get { return _linePosition; }
    }

    /// <summary>
    /// Gets the absolute stream position of this error in the document, relative to the start of the document.
    /// </summary>
    public int StreamPosition
    {
        get { return _streamPosition; }
    }

    /// <summary>
    /// Gets the full text of the line containing the error.
    /// </summary>
    public string SourceText
    {
        get { return _sourceText; }
    }

    /// <summary>
    /// Gets a description for the error.
    /// </summary>
    public string Reason
    {
        get { return _reason; }
    }
}

// Abstraction that lets the parser address its input by character index,
// whether the input is a plain string or a buffered stream.
abstract class StreamAsArray
{
    // True when index is at (or past) the end of the input.
    public abstract bool Eof(int index);

    // Character at the given index.
    public abstract char this[int index] { get; }

    // The length characters starting at startindex.
    public abstract string Substring(int startindex, int length);

    // Total length of the input as reported by the implementation.
    public abstract int FullLength { get; }
}

// SLIM: creating this class to wrap around a textreader
// to emulate ReadToEnd () behaviour
//
// Keeps a sliding window of two blocks (previous + current) over the reader.
// _position is the index of the first character of the current block.
// Requests outside the window fall back to OutOfBandRead, which rewinds the
// base stream, so the base stream must be seekable for those.
class ImplStreamAsArray : StreamAsArray
{
    private StreamReader _reader;
    private int _length;            // total character count; only valid once _eof is set
    private int _position;          // index of the first character in _buf_current
    private bool _eof;              // set once the reader has been exhausted
    private char[] _buf_previous;   // could have used only one array
    private char[] _buf_current;    // but, this is cleaner
    private int _block_size;

    public ImplStreamAsArray(StreamReader r)
    {
        if (r == null)
        {
            throw new ArgumentNullException("r");
        }

        _reader = r;
        _length = 0;
        _position = 0;
        _eof = false;
        _block_size = 1024;
        _buf_previous = new char[_block_size];
        _buf_current = new char[_block_size];
        Read(true);
    }

    // Reads up to count characters into buffer[0..count), looping over short
    // reads. StreamReader.Read may return fewer characters than requested
    // without being at the end, so a result below count here really is EOF.
    private int ReadFully(char[] buffer, int count)
    {
        int total = 0;
        while (total < count)
        {
            int n = _reader.Read(buffer, total, count - total);
            if (n <= 0)
            {
                break;
            }
            total += n;
        }
        return total;
    }

    // Loads the next block. Unless this is the initial fill, the current
    // block first becomes the previous block and the window advances.
    private void Read(bool initial)
    {
        if (!initial)
        {
            Array.Copy(_buf_current, _buf_previous, _block_size);
            _position += _block_size;
        }

        HtmlDocument.Debug("Debug: Read in buffer at:" + _position);

        int num_read = ReadFully(_buf_current, _block_size);
        if (num_read < _block_size)
        {
            _eof = true;
            _length = _position + num_read;
        }

        HtmlDocument.Debug("[" + new string(_buf_current, 0, num_read) + "]");
    }

    // Reports whether index is at or beyond the end of the input. If the end
    // is not yet known and index lies in the block right after the current
    // one, that block is read first.
    public override bool Eof(int index)
    {
        bool inNextBlock = index >= _position + _block_size
                           && index < _position + _block_size + _block_size;
        if (!_eof && inNextBlock)
        {
            Read(false);
        }

        // ">=" (not "==") so an index past the end is never reported as valid,
        // matching DummyStreamAsArray.Eof.
        return _eof && index >= _length;
    }

    // Returns the character at index when it lies in the previous, current or
    // immediately following block (the latter triggers a read-ahead).
    // Throws ArgumentOutOfRangeException otherwise.
    public override char this[int index]
    {
        get
        {
            if (index >= _position && index < _position + _block_size)
            {
                return _buf_current[index % _block_size];
            }

            if (index >= _position - _block_size && index < _position)
            {
                return _buf_previous[index % _block_size];
            }

            if (index >= _position + _block_size && index < _position + _block_size + _block_size)
            {
                Read(false);
                return _buf_current[index % _block_size];
            }

            HtmlDocument.Debug("EXCEPTION!!!");
            throw new ArgumentOutOfRangeException(
                "index",
                String.Format(
                    "{0} is out of current bounds:[{1}-{2}] and further than read-ahead",
                    index,
                    _position - _block_size,
                    _position + _block_size - 1));
        }
    }

    // evil function ... you get what you pay for!
    // Serves a Substring request that the two-block window cannot satisfy by
    // rewinding the reader, reading the range directly, and then putting the
    // reader back where the windowed reads expect it.
    private string OutOfBandRead(int startindex, int length)
    {
        HtmlDocument.Debug("Out of band read! From " + startindex + " to " + (startindex + length - 1));
        ResetPosition(startindex);

        // now we are at the correct place; the scratch buffer does not need
        // to align with block boundaries
        char[] temp_buf = new char[length];
        int num_read = ReadFully(temp_buf, length);
        if (num_read < length)
        {
            // Shouldnt occur!!!
            _eof = true;
            _length = startindex + num_read;
        }

        // discard data and reset stream position
        int t = (_eof ? _length : _position + _block_size);
        ResetPosition(t);

        // only hand back what was actually read, not the unread NUL padding
        return new String(temp_buf, 0, num_read);
    }

    // streamreader does not allow seeking
    // seek on its basestream does not reflect the position
    // of the reader - it is governed by the buffer size
    // of the underlying stream
    // :( so, rewind and read forward from the beginning until pos
    private void ResetPosition(int pos)
    {
        _reader.DiscardBufferedData();
        _reader.BaseStream.Position = 0;

        // skip forward in chunks of at most one block
        char[] tmp = new char[_block_size];
        int remaining = pos;
        while (remaining > 0)
        {
            int skipped = ReadFully(tmp, Math.Min(remaining, _block_size));
            if (skipped == 0)
            {
                // input is shorter than pos; nothing more to skip
                break;
            }
            remaining -= skipped;
        }
    }

    // Returns length characters starting at startindex. Requests longer than
    // one block, or starting before the previous block, go out of band.
    public override string Substring(int startindex, int length)
    {
        if (length == 0)
        {
            HtmlDocument.Debug("substring:" + startindex + " " + length + " " + _position + ":");
            return String.Empty;
        }

        if (length > _block_size || startindex < _position - _block_size)
        {
            return OutOfBandRead(startindex, length);
        }

        // the range runs past the current block: slide the window forward
        if (startindex + length - 1 >= _position + _block_size)
        {
            Read(false);
        }

        string substr;
        if (startindex < _position)
        {
            // len_1 = number of requested characters available in the previous block
            int len_1 = _position - startindex;
            if (length < len_1)
            {
                substr = new String(_buf_previous, _block_size - len_1, length);
            }
            else
            {
                substr = new String(_buf_previous, _block_size - len_1, len_1);
                substr += new String(_buf_current, 0, length - len_1);
            }
        }
        else
        {
            substr = new String(_buf_current, startindex - _position, length);
        }

        return substr;
    }

    // FIXME: Is this costly ?
    // NOTE(review): this is the base stream's length (bytes), which need not
    // equal the number of decoded characters - confirm callers only use it as
    // an estimate.
    public override int FullLength
    {
        get { return (int)_reader.BaseStream.Length; }
    }
}

// A dummy StreamAsArray wrapper around a string
class DummyStreamAsArray : StreamAsArray
{
    private string _base_string;
    private int _length;

    public DummyStreamAsArray(string str)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        _base_string = str;
        _length = str.Length;
    }

    public override bool Eof(int index)
    {
        return (index >= _length);
    }

    // Must be "override": the base indexer is abstract, so hiding it with
    // "new" leaves the abstract member unimplemented and does not compile.
    public override char this[int index]
    {
        get { return _base_string[index]; }
    }

    public override string Substring(int startindex, int length)
    {
        return _base_string.Substring(startindex, length);
    }

    public override int FullLength
    {
        get { return _length; }
    }
}
///

/// <summary>
/// Represents a complete HTML document.
/// </summary>
public class HtmlDocument: IXPathNavigable {
    // SLIM: Make the parser event driven
    // callback for FilterHtml
    // return value is a way for the callback to signal to continue or stop parsing
    public delegate bool NodeHandler (HtmlNode node);
    public NodeHandler ReportNode;

    // misnomer ... should be called event_driven_mode
    private bool _streammode = false;
    private bool _stop_parsing = false;

    // Messages for exceptions raised elsewhere in the library.
    internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";
    internal static readonly string HtmlExceptionUseIdAttributeFalse = "You need to set UseIdAttribute property to true to enable this feature";

    internal Hashtable _openednodes;
    internal Hashtable _lastnodes = new Hashtable();
    internal Hashtable _nodesid;
    private HtmlNode _documentnode;
    //SLIM: internal string _text;
    // Input being parsed, addressed by character index.
    internal StreamAsArray _text;
    private HtmlNode _currentnode;
    private HtmlNode _lastparentnode;
    private HtmlAttribute _currentattribute;
    private int _index;
    private int _line;
    private int _lineposition, _maxlineposition;
    private int _c;
    private bool _fullcomment;
    private System.Text.Encoding _streamencoding;
    private System.Text.Encoding _declaredencoding;
    private ArrayList _parseerrors = new ArrayList();
    private ParseState _state, _oldstate;
    private Crc32 _crc32 = null;
    private bool _onlyDetectEncoding = false;
    private int _pcdata_quote_char = '\0';

    // Diagnostics switch for Debug(); off by default.
    private static bool _debug = false;

    // Writes s to the console, but only when _debug is set.
    internal static void Debug (string s) { if (_debug) Console.WriteLine (s); }

    // public props
    ///

/// Defines if a checksum must be computed for the document while parsing. Default is false. ///

public bool OptionComputeChecksum = false; ///

/// Defines if declared encoding must be read from the document. /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node. /// Default is true. ///

public bool OptionReadEncoding = true; ///

/// Defines if non closed nodes will be checked at the end of parsing. Default is true. ///

public bool OptionCheckSyntax = true; ///

/// Defines if the 'id' attribute must be specifically used. Default is true. ///

public bool OptionUseIdAttribute = true; ///

/// Defines if empty nodes must be written as closed during output. Default is false. ///

public bool OptionWriteEmptyNodes = false; ///

/// Defines if output must conform to XML, instead of HTML. Default is false. ///

public bool OptionOutputAsXml = false; ///

/// Defines if name must be output in uppercase. Default is false. ///

public bool OptionOutputUpperCase = false; ///

/// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false. ///

public bool OptionOutputOptimizeAttributeValues = false; ///

/// Adds Debugging attributes to node. Default is false. ///

public bool OptionAddDebuggingAttributes = false; ///

/// Defines if source text must be extracted while parsing errors. /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true. /// Default is false. ///

public bool OptionExtractErrorSourceText = false; // turning this on can dramatically slow performance if a lot of errors are detected ///

/// Defines if closing for non closed nodes must be done at the end or directly in the document. /// Setting this to true can actually change how browsers render the page. Default is false. ///

public bool OptionAutoCloseOnEnd = false; // close errors at the end ///

/// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false. ///

public bool OptionFixNestedTags = false; // fix li, tr, th, td tags ///

/// Defines the maximum length of source text or parse errors. Default is 100. ///

public int OptionExtractErrorSourceTextMaxLength = 100; ///

/// Defines the default stream encoding to use. Default is System.Text.Encoding.UTF8. ///

// From http://www.w3.org/TR/REC-html40/charset.html // The HTTP protocol ([RFC2616], section 3.7.1) mentions ISO-8859-1 as a default character encoding when the "charset" parameter is absent from the "Content-Type" header field. // So, however we are still using UTF-8 for some unknown reason //FIXME: Fix the default encoding! public System.Text.Encoding OptionDefaultStreamEncoding = Encoding.UTF8; ///

/// <summary>
/// Gets a list of parse errors found in the document.
/// </summary>
public ArrayList ParseErrors
{
    get
    {
        return _parseerrors;
    }
}
///

/// <summary>
/// Gets the document's stream encoding.
/// </summary>
public System.Text.Encoding StreamEncoding
{
    get
    {
        return _streamencoding;
    }
}
///

/// <summary>
/// Gets the document's declared encoding.
/// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
/// </summary>
public System.Text.Encoding DeclaredEncoding
{
    get
    {
        return _declaredencoding;
    }
}
///

/// <summary>
/// Creates an instance of an HTML document.
/// </summary>
public HtmlDocument()
{
    _documentnode = CreateNode(HtmlNodeType.Document, 0);
}

// Returns the first child of the document node whose name is "?xml",
// or null when the document node has no children or no such child.
internal HtmlNode GetXmlDeclaration()
{
    if (_documentnode.HasChildNodes)
    {
        foreach (HtmlNode child in _documentnode._childnodes)
        {
            // it's ok, names are case sensitive
            if (child.Name == "?xml")
            {
                return child;
            }
        }
    }

    return null;
}
///

/// <summary>
/// Applies HTML encoding to a specified string.
/// </summary>
/// <param name="html">The input string to encode. May not be null.</param>
/// <returns>The encoded string.</returns>
/// <exception cref="ArgumentNullException">html is null.</exception>
public static string HtmlEncode(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // replace & by &amp; but only once! An ampersand that already starts one
    // of the four entities handled here is left alone (case-insensitively),
    // so already-encoded text is not double-encoded.
    Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);

    // Ampersands go first so the entities produced below are not re-encoded.
    return rx.Replace(html, "&amp;")
             .Replace("<", "&lt;")
             .Replace(">", "&gt;")
             .Replace("\"", "&quot;");
}
///

/// <summary>
/// Detects the encoding of an HTML stream.
/// </summary>
/// <param name="stream">The input stream. May not be null.</param>
/// <returns>The detected encoding.</returns>
public Encoding DetectEncoding(Stream stream)
{
    if (stream != null)
    {
        // Wrap the stream in a reader and defer to the reader-based overload.
        StreamReader streamReader = new StreamReader(stream);
        return DetectEncoding(streamReader);
    }

    throw new ArgumentNullException("stream");
}
///

/// <summary>
/// Detects the encoding of an HTML file.
/// </summary>
/// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
/// <returns>The detected encoding.</returns>
/// <exception cref="ArgumentNullException">path is null.</exception>
public Encoding DetectEncoding(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // "using" guarantees the file is closed even when detection throws;
    // the file is opened with OptionDefaultStreamEncoding.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        return DetectEncoding(sr);
    }
}
///

/// <summary>
/// Detects the encoding of an HTML text.
/// </summary>
/// <param name="html">The input html text. May not be null.</param>
/// <returns>The detected encoding.</returns>
/// <exception cref="ArgumentNullException">html is null.</exception>
public Encoding DetectEncodingHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // "using" disposes the reader on every exit path, including exceptions.
    using (StringReader sr = new StringReader(html))
    {
        return DetectEncoding(sr);
    }
}
///

/// <summary>
/// Detects the encoding of an HTML text provided on a TextReader.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
/// <returns>
/// The encoding carried by the EncodingFoundException raised during parsing,
/// or null when parsing completes without one.
/// </returns>
public Encoding DetectEncoding(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = true;

    // Bookkeeping tables exist only when the matching option is enabled.
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    StreamReader streamReader = reader as StreamReader;
    if (streamReader == null)
    {
        _streamencoding = null;
        // Expensive, but cannot avoid since TextReader doesnt have any length of the underlying data
        _text = new DummyStreamAsArray (reader.ReadToEnd());
    }
    else
    {
        // Capture the encoding before the wrapper performs its first read.
        _streamencoding = streamReader.CurrentEncoding;
        _text = new ImplStreamAsArray (streamReader);
    }

    _declaredencoding = null;
    // SLIM: _text = reader.ReadToEnd();
    _documentnode = CreateNode(HtmlNodeType.Document, 0);

    // this is a hack, but it allows us not to muck with the original parsing code
    Encoding detected = null;
    try
    {
        Parse();
    }
    catch (EncodingFoundException ex)
    {
        _lastnodes.Clear();
        detected = ex.Encoding;
    }

    return detected;
}
///

/// <summary>
/// Loads an HTML document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
public void Load(Stream stream)
{
    // The stream is decoded with OptionDefaultStreamEncoding.
    StreamReader streamReader = new StreamReader(stream, OptionDefaultStreamEncoding);
    Load(streamReader);
}
///

/// <summary>
/// Loads an HTML document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
{
    StreamReader streamReader = new StreamReader(stream, detectEncodingFromByteOrderMarks);
    Load(streamReader);
}
///

/// <summary>
/// Loads an HTML document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(Stream stream, Encoding encoding)
{
    StreamReader streamReader = new StreamReader(stream, encoding);
    Load(streamReader);
}
///

/// <summary>
/// Loads an HTML document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    StreamReader streamReader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks);
    Load(streamReader);
}
///

/// <summary>
/// Loads an HTML document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    StreamReader streamReader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
    Load(streamReader);
}
///