// HtmlAgilityPack V1.0 - Simon Mourier
/*
Copyright (C) 2003 Simon Mourier
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
3. The name of the author may not be used to endorse or promote products
   derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
using System;
using System.Diagnostics;
using System.Collections.Specialized;
using System.IO;
using System.Text;
using System.Xml;
using System.Xml.XPath;

namespace HtmlAgilityPack
{
    /// <summary>
    /// XmlNameTable implementation that forwards every request to a private NameTable.
    /// </summary>
    internal class HtmlNameTable : XmlNameTable
    {
        private NameTable _nametable = new NameTable();

        internal HtmlNameTable()
        {
        }

        /// <summary>
        /// Looks the string up in the table and adds it when the lookup finds nothing.
        /// </summary>
        /// <param name="array">The string to look up or add.</param>
        /// <returns>The string returned by Get, or by Add when Get returned null.</returns>
        internal string GetOrAdd(string array)
        {
            string known = Get(array);
            return (known != null) ? known : Add(array);
        }

        public override string Get(string array)
        {
            return _nametable.Get(array);
        }

        public override string Get(char[] array, int offset, int length)
        {
            return _nametable.Get(array, offset, length);
        }

        public override string Add(string array)
        {
            return _nametable.Add(array);
        }

        public override string Add(char[] array, int offset, int length)
        {
            return _nametable.Add(array, offset, length);
        }
    }

    /// <summary>
    /// Represents an HTML navigator on an HTML document seen as a data store.
    /// </summary>
    public class HtmlNodeNavigator : XPathNavigator
    {
        // Document being navigated; the public constructors load into this instance.
        private HtmlDocument _doc = new HtmlDocument();

        // Node the navigator is currently positioned on.
        private HtmlNode _currentnode;

        // Index into the current node's attributes, or -1 (the value Reset assigns)
        // when the navigator is not positioned on an attribute.
        private int _attindex;

        private HtmlNameTable _nametable = new HtmlNameTable();

        // When false, InternalTrace returns immediately without writing anything.
        internal bool Trace = false;

        internal HtmlNodeNavigator()
        {
            Reset();
        }

        /// <summary>
        /// Positions the navigator on the document node and clears the attribute index.
        /// </summary>
        private void Reset()
        {
            InternalTrace(null);
            _attindex = -1;
            _currentnode = _doc.DocumentNode;
        }

        /// <summary>
        /// Writes a diagnostic line describing the navigator state. Calls are only compiled
        /// in when the TRACE symbol is defined, and nothing is written unless Trace is true.
        /// </summary>
        /// <param name="Value">Extra information appended to the end of the trace line.</param>
        [Conditional("TRACE")]
        internal void InternalTrace(object Value)
        {
            if (!Trace)
            {
                return;
            }

            // Frame 1 is the member that called InternalTrace; its name becomes the trace category.
            StackFrame sf = new StackFrame(1, true);
            string name = sf.GetMethod().Name;

            string nodename = "(null)";
            string nodevalue = "(null)";
            if (_currentnode != null)
            {
                nodename = _currentnode.Name;
                switch (_currentnode.NodeType)
                {
                    case HtmlNodeType.Comment:
                        nodevalue = ((HtmlCommentNode)_currentnode).Comment;
                        break;

                    case HtmlNodeType.Document:
                        nodevalue = "";
                        break;

                    case HtmlNodeType.Text:
                        nodevalue = ((HtmlTextNode)_currentnode).Text;
                        break;

                    default:
                        // Shallow clone: only the node's own markup is traced.
                        nodevalue = _currentnode.CloneNode(false).OuterHtml;
                        break;
                }
            }

            string message = "oid=" + GetHashCode() + ",n=" + nodename + ",a=" + _attindex + "," + ",v=" + nodevalue + "," + Value;
            System.Diagnostics.Trace.WriteLine(message, "N!" + name);
        }

        /// <summary>
        /// Creates a navigator over an existing document, positioned on the given node.
        /// </summary>
        /// <param name="doc">The document to navigate.</param>
        /// <param name="currentNode">The start node; its OwnerDocument must be <paramref name="doc"/>.</param>
        internal HtmlNodeNavigator(HtmlDocument doc, HtmlNode currentNode)
        {
            if (currentNode == null)
            {
                throw new ArgumentNullException("currentNode");
            }

            if (currentNode.OwnerDocument != doc)
            {
                throw new ArgumentException(HtmlDocument.HtmlExceptionRefNotChild);
            }

            InternalTrace(null);

            _doc = doc;
            Reset();

            // Reset put us on the document node; move to the requested node.
            _currentnode = currentNode;
        }

        /// <summary>
        /// Copy constructor: shares the document, position and name table of another navigator.
        /// </summary>
        /// <param name="nav">The navigator to copy.</param>
        private HtmlNodeNavigator(HtmlNodeNavigator nav)
        {
            if (nav == null)
            {
                throw new ArgumentNullException("nav");
            }

            InternalTrace(null);

            _nametable = nav._nametable; // REVIEW: should we do this?
            _doc = nav._doc;
            _currentnode = nav._currentnode;
            _attindex = nav._attindex;
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        public HtmlNodeNavigator(Stream stream)
        {
            _doc.Load(stream);
            Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
        public HtmlNodeNavigator(Stream stream, bool detectEncodingFromByteOrderMarks)
        {
            _doc.Load(stream, detectEncodingFromByteOrderMarks);
            Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        /// <param name="encoding">The character encoding to use.</param>
        public HtmlNodeNavigator(Stream stream, Encoding encoding)
        {
            _doc.Load(stream, encoding);
            Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        /// <param name="encoding">The character encoding to use.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public HtmlNodeNavigator(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
        {
            // Load first, then Reset: Reset positions the navigator on the document node.
            this._doc.Load(stream, encoding, detectEncodingFromByteOrderMarks);
            this.Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        /// <param name="encoding">The character encoding to use.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
        /// <param name="buffersize">The minimum buffer size.</param>
        public HtmlNodeNavigator(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
        {
            this._doc.Load(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
            this.Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a TextReader.
        /// </summary>
        /// <param name="reader">The TextReader used to feed the HTML data into the document.</param>
        public HtmlNodeNavigator(TextReader reader)
        {
            this._doc.Load(reader);
            this.Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        public HtmlNodeNavigator(string path)
        {
            this._doc.Load(path);
            this.Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
        public HtmlNodeNavigator(string path, bool detectEncodingFromByteOrderMarks)
        {
            this._doc.Load(path, detectEncodingFromByteOrderMarks);
            this.Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        /// <param name="encoding">The character encoding to use.</param>
        public HtmlNodeNavigator(string path, Encoding encoding)
        {
            this._doc.Load(path, encoding);
            this.Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        /// <param name="encoding">The character encoding to use.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
public HtmlNodeNavigator(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
        {
            _doc.Load(path, encoding, detectEncodingFromByteOrderMarks);
            Reset();
        }

        /// <summary>
        /// Initializes a new instance of the HtmlNavigator and loads an HTML document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        /// <param name="encoding">The character encoding to use.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
        /// <param name="buffersize">The minimum buffer size.</param>
        public HtmlNodeNavigator(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
        {
            _doc.Load(path, encoding, detectEncodingFromByteOrderMarks, buffersize);
            Reset();
        }

        /// <summary>
        /// Gets the name of the current HTML node without the namespace prefix.
        /// When the navigator is positioned on an attribute, this is the attribute's name.
        /// </summary>
        public override string LocalName
        {
            get
            {
                if (_attindex != -1)
                {
                    InternalTrace("att>" + _currentnode.Attributes[_attindex].Name);
                    return _nametable.GetOrAdd(_currentnode.Attributes[_attindex].Name);
                }

                InternalTrace("node>" + _currentnode.Name);
                return _nametable.GetOrAdd(_currentnode.Name);
            }
        }

        /// <summary>
        /// Gets the namespace URI (as defined in the W3C Namespace Specification) of the current node.
        /// Always returns string.Empty in the case of HtmlNavigator implementation.
        /// </summary>
        public override string NamespaceURI
        {
            get
            {
                InternalTrace(">");
                return _nametable.GetOrAdd(string.Empty);
            }
        }

        /// <summary>
        /// Gets the qualified name of the current node.
        /// When the navigator is positioned on an attribute, this is the attribute's name,
        /// matching what LocalName returns (no prefixes are ever reported).
        /// </summary>
        public override string Name
        {
            get
            {
                // Previously the owner element's name was returned even on an attribute,
                // which disagreed with LocalName and NodeType for the same position.
                if (_attindex != -1)
                {
                    InternalTrace(">" + _currentnode.Attributes[_attindex].Name);
                    return _nametable.GetOrAdd(_currentnode.Attributes[_attindex].Name);
                }

                InternalTrace(">" + _currentnode.Name);
                return _nametable.GetOrAdd(_currentnode.Name);
            }
        }

        /// <summary>
        /// Gets the prefix associated with the current node.
        /// Always returns string.Empty in the case of HtmlNavigator implementation.
        /// </summary>
        public override string Prefix
        {
            get
            {
                InternalTrace(null);
                return _nametable.GetOrAdd(string.Empty);
            }
        }

        /// <summary>
        /// Gets the type of the current node.
        /// </summary>
        /// <exception cref="NotImplementedException">The current node has an HtmlNodeType this navigator does not handle.</exception>
        public override XPathNodeType NodeType
        {
            get
            {
                switch (_currentnode.NodeType)
                {
                    case HtmlNodeType.Comment:
                        InternalTrace(">" + XPathNodeType.Comment);
                        return XPathNodeType.Comment;

                    case HtmlNodeType.Document:
                        InternalTrace(">" + XPathNodeType.Root);
                        return XPathNodeType.Root;

                    case HtmlNodeType.Text:
                        InternalTrace(">" + XPathNodeType.Text);
                        return XPathNodeType.Text;

                    case HtmlNodeType.Element:
                    {
                        // An element with a selected attribute index is reported as that attribute.
                        if (_attindex != -1)
                        {
                            InternalTrace(">" + XPathNodeType.Attribute);
                            return XPathNodeType.Attribute;
                        }
                        InternalTrace(">" + XPathNodeType.Element);
                        return XPathNodeType.Element;
                    }

                    default:
                        throw new NotImplementedException("Internal error: Unhandled HtmlNodeType: " + _currentnode.NodeType);
                }
            }
        }

        /// <summary>
        /// Gets the text value of the current node: the comment text, the text node's text,
        /// the attribute value, or the element's InnerText. The document node yields "".
        /// </summary>
        /// <exception cref="NotImplementedException">The current node has an HtmlNodeType this navigator does not handle.</exception>
        public override string Value
        {
            get
            {
                InternalTrace("nt=" + _currentnode.NodeType);
                switch (_currentnode.NodeType)
                {
                    case HtmlNodeType.Comment:
                        InternalTrace(">" + ((HtmlCommentNode)_currentnode).Comment);
                        return ((HtmlCommentNode)_currentnode).Comment;

                    case HtmlNodeType.Document:
                        InternalTrace(">");
                        return "";

                    case HtmlNodeType.Text:
                        InternalTrace(">" + ((HtmlTextNode)_currentnode).Text);
                        return ((HtmlTextNode)_currentnode).Text;

                    case HtmlNodeType.Element:
                    {
                        if (_attindex != -1)
                        {
                            InternalTrace(">" + _currentnode.Attributes[_attindex].Value);
                            return _currentnode.Attributes[_attindex].Value;
                        }
                        return _currentnode.InnerText;
                    }

                    default:
                        throw new NotImplementedException("Internal error: Unhandled HtmlNodeType: " + _currentnode.NodeType);
                }
            }
        }

        /// <summary>
        /// Gets the base URI for the current node.
        /// Always returns string.Empty in the case of HtmlNavigator implementation.
        /// </summary>
        public override string BaseURI
        {
            get
            {
                InternalTrace(">");
                return _nametable.GetOrAdd(string.Empty);
            }
        }

        /// <summary>
        /// Gets the xml:lang scope for the current node.
        /// Always returns string.Empty in the case of HtmlNavigator implementation.
        /// </summary>
        public override string XmlLang
        {
            get
            {
                InternalTrace(null);
                return _nametable.GetOrAdd(string.Empty);
            }
        }

        /// <summary>
        /// Gets a value indicating whether the current node is an empty element.
        /// Implemented as the negation of HasChildren.
        /// </summary>
        public override bool IsEmptyElement
        {
            get
            {
                InternalTrace(">" + !HasChildren);
                // REVIEW: is this ok?
                return !HasChildren;
            }
        }

        /// <summary>
        /// Gets the XmlNameTable associated with this implementation.
        /// </summary>
        public override XmlNameTable NameTable
        {
            get
            {
                InternalTrace(null);
                return _nametable;
            }
        }

        /// <summary>
        /// Gets a value indicating whether the current node has attributes.
        /// </summary>
        public override bool HasAttributes
        {
            get
            {
                InternalTrace(">" + (_currentnode.Attributes.Count > 0));
                return (_currentnode.Attributes.Count > 0);
            }
        }

        /// <summary>
        /// Gets a value indicating whether the current node has child nodes.
        /// </summary>
        public override bool HasChildren
        {
            get
            {
                InternalTrace(">" + (_currentnode.ChildNodes.Count > 0));
                return (_currentnode.ChildNodes.Count > 0);
            }
        }

        /// <summary>
        /// Moves to the next sibling of the current node.
        /// </summary>
        /// <returns>true if the navigator is successful moving to the next sibling node, false if there are no more siblings or if the navigator is currently positioned on an attribute node. If false, the position of the navigator is unchanged.</returns>
        public override bool MoveToNext()
        {
            // Honour the documented contract: an attribute position has no siblings.
            // Without this check the element changed while _attindex stayed set.
            if (_attindex != -1)
            {
                InternalTrace(">false");
                return false;
            }

            if (_currentnode.NextSibling == null)
            {
                InternalTrace(">false");
                return false;
            }

            // Arguments of a [Conditional] call are still evaluated when TRACE is defined,
            // so only build the cloned markup when tracing is actually switched on.
            if (Trace)
            {
                InternalTrace("_c=" + _currentnode.CloneNode(false).OuterHtml);
                InternalTrace("_n=" + _currentnode.NextSibling.CloneNode(false).OuterHtml);
            }

            _currentnode = _currentnode.NextSibling;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves to the previous sibling of the current node.
        /// </summary>
        /// <returns>true if the navigator is successful moving to the previous sibling node, false if there is no previous sibling or if the navigator is currently positioned on an attribute node.</returns>
public override bool MoveToPrevious()
        {
            // Honour the documented contract: an attribute position has no siblings.
            if (_attindex != -1)
            {
                InternalTrace(">false");
                return false;
            }

            if (_currentnode.PreviousSibling == null)
            {
                InternalTrace(">false");
                return false;
            }

            _currentnode = _currentnode.PreviousSibling;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves to the first sibling of the current node.
        /// </summary>
        /// <returns>true if the navigator is successful moving to the first sibling node, false if there is no first sibling or if the navigator is currently positioned on an attribute node.</returns>
        public override bool MoveToFirst()
        {
            // Honour the documented contract: an attribute position has no siblings.
            if (_attindex != -1)
            {
                InternalTrace(">false");
                return false;
            }

            if (_currentnode.ParentNode == null)
            {
                InternalTrace(">false");
                return false;
            }

            if (_currentnode.ParentNode.FirstChild == null)
            {
                InternalTrace(">false");
                return false;
            }

            _currentnode = _currentnode.ParentNode.FirstChild;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves to the first child of the current node.
        /// </summary>
        /// <returns>true if there is a first child node, otherwise false. Always false while positioned on an attribute.</returns>
        public override bool MoveToFirstChild()
        {
            // An attribute has no children; descending here would move to the owner
            // element's child while _attindex still pointed at the old attribute.
            if (_attindex != -1)
            {
                InternalTrace(">false");
                return false;
            }

            if (!_currentnode.HasChildNodes)
            {
                InternalTrace(">false");
                return false;
            }

            _currentnode = _currentnode.ChildNodes[0];
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves to the parent of the current node. From an attribute position the parent
        /// is the element that owns the attribute.
        /// </summary>
        /// <returns>true if there is a parent node, otherwise false.</returns>
        public override bool MoveToParent()
        {
            // _currentnode already is the owner element while on an attribute, so leaving
            // the attribute only requires clearing the index.
            if (_attindex != -1)
            {
                _attindex = -1;
                InternalTrace(">true");
                return true;
            }

            if (_currentnode.ParentNode == null)
            {
                InternalTrace(">false");
                return false;
            }

            _currentnode = _currentnode.ParentNode;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves to the root node to which the current node belongs.
        /// </summary>
        public override void MoveToRoot()
        {
            _currentnode = _doc.DocumentNode;
            // Clear any attribute position so it is not applied to the document node.
            _attindex = -1;
            InternalTrace(null);
        }

        /// <summary>
        /// Moves to the same position as the specified HtmlNavigator.
        /// </summary>
        /// <param name="other">The HtmlNavigator positioned on the node that you want to move to.</param>
        /// <returns>true if successful, otherwise false. If false, the position of the navigator is unchanged.</returns>
public override bool MoveTo(XPathNavigator other)
        {
            HtmlNodeNavigator nav = other as HtmlNodeNavigator;
            if (nav == null)
            {
                InternalTrace(">false (nav is not an HtmlNodeNavigator)");
                return false;
            }

            InternalTrace("moveto oid=" + nav.GetHashCode() + ", n:" + nav._currentnode.Name + ", a:" + nav._attindex);

            // Only navigators over the same document instance can share a position.
            if (nav._doc == _doc)
            {
                _currentnode = nav._currentnode;
                _attindex = nav._attindex;
                InternalTrace(">true");
                return true;
            }

            // we don't know how to handle that
            InternalTrace(">false (???)");
            return false;
        }

        /// <summary>
        /// Moves to the node that has an attribute of type ID whose value matches the specified string.
        /// </summary>
        /// <param name="id">A string representing the ID value of the node to which you want to move. This argument does not need to be atomized.</param>
        /// <returns>true if the move was successful, otherwise false. If false, the position of the navigator is unchanged.</returns>
        public override bool MoveToId(string id)
        {
            InternalTrace("id=" + id);
            HtmlNode node = _doc.GetElementbyId(id);
            if (node == null)
            {
                InternalTrace(">false");
                return false;
            }

            _currentnode = node;
            // The target is the element itself; drop any attribute position left over
            // from the previous node so it is not applied to the new one.
            _attindex = -1;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Determines whether the current HtmlNavigator is at the same position as the specified HtmlNavigator.
        /// Both the node and the attribute position must match.
        /// </summary>
        /// <param name="other">The HtmlNavigator that you want to compare against.</param>
        /// <returns>true if the two navigators have the same position, otherwise, false.</returns>
        public override bool IsSamePosition(XPathNavigator other)
        {
            HtmlNodeNavigator nav = other as HtmlNodeNavigator;
            if (nav == null)
            {
                InternalTrace(">false");
                return false;
            }

            // Comparing the node alone made an element and each of its attributes
            // look like one and the same position.
            bool same = (nav._currentnode == _currentnode) && (nav._attindex == _attindex);
            InternalTrace(">" + same);
            return same;
        }

        /// <summary>
        /// Creates a new HtmlNavigator positioned at the same node as this HtmlNavigator.
        /// </summary>
        /// <returns>A new HtmlNavigator object positioned at the same node as the original HtmlNavigator.</returns>
        public override XPathNavigator Clone()
        {
            InternalTrace(null);
            return new HtmlNodeNavigator(this);
        }

        /// <summary>
        /// Gets the value of the HTML attribute with the specified LocalName and NamespaceURI.
        /// </summary>
        /// <param name="localName">The local name of the HTML attribute.</param>
        /// <param name="namespaceURI">The namespace URI of the attribute. Unsupported with the HtmlNavigator implementation.</param>
        /// <returns>The value of the specified HTML attribute, or null if a matching attribute is not found.</returns>
        public override string GetAttribute(string localName, string namespaceURI)
        {
            InternalTrace("localName=" + localName + ", namespaceURI=" + namespaceURI);
            HtmlAttribute att = _currentnode.Attributes[localName];
            if (att == null)
            {
                InternalTrace(">null");
                return null;
            }

            InternalTrace(">" + att.Value);
            return att.Value;
        }

        /// <summary>
        /// Moves to the HTML attribute with matching LocalName and NamespaceURI.
        /// </summary>
        /// <param name="localName">The local name of the HTML attribute.</param>
        /// <param name="namespaceURI">The namespace URI of the attribute. Unsupported with the HtmlNavigator implementation.</param>
        /// <returns>true if the HTML attribute is found, otherwise, false. If false, the position of the navigator does not change.</returns>
        public override bool MoveToAttribute(string localName, string namespaceURI)
        {
            InternalTrace("localName=" + localName + ", namespaceURI=" + namespaceURI);
            int index = _currentnode.Attributes.GetAttributeIndex(localName);
            if (index == -1)
            {
                InternalTrace(">false");
                return false;
            }

            _attindex = index;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves to the first HTML attribute.
        /// </summary>
        /// <returns>true if the navigator is successful moving to the first HTML attribute, otherwise, false.</returns>
        public override bool MoveToFirstAttribute()
        {
            if (!HasAttributes)
            {
                InternalTrace(">false");
                return false;
            }

            _attindex = 0;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Moves to the next HTML attribute.
        /// </summary>
        /// <returns>true if the attribute index was advanced, false if it already is on the last attribute or the node has no attributes.</returns>
        public override bool MoveToNextAttribute()
        {
            InternalTrace(null);

            // NOTE(review): when the navigator is not on an attribute (_attindex == -1) and the
            // node has attributes, this advances to the first one. XPathNavigator's documented
            // contract returns false in that case; left unchanged for compatibility - confirm.
            if (_attindex >= (_currentnode.Attributes.Count - 1))
            {
                InternalTrace(">false");
                return false;
            }

            _attindex++;
            InternalTrace(">true");
            return true;
        }

        /// <summary>
        /// Returns the value of the namespace node corresponding to the specified local name.
        /// Always returns string.Empty for the HtmlNavigator implementation.
        /// </summary>
        /// <param name="name">The local name of the namespace node.</param>
        /// <returns>Always returns string.Empty for the HtmlNavigator implementation.</returns>
        public override string GetNamespace(string name)
        {
            InternalTrace("name=" + name);
            return string.Empty;
        }

        /// <summary>
        /// Moves the XPathNavigator to the namespace node with the specified local name.
        /// Always returns false for the HtmlNavigator implementation.
        /// </summary>
        /// <param name="name">The local name of the namespace node.</param>
        /// <returns>Always returns false for the HtmlNavigator implementation.</returns>
        public override bool MoveToNamespace(string name)
        {
            InternalTrace("name=" + name);
            return false;
        }

        /// <summary>
        /// Moves the XPathNavigator to the first namespace node of the current element.
        /// Always returns false for the HtmlNavigator implementation.
        /// </summary>
        /// <param name="scope">An XPathNamespaceScope value describing the namespace scope.</param>
        /// <returns>Always returns false for the HtmlNavigator implementation.</returns>
        public override bool MoveToFirstNamespace(XPathNamespaceScope scope)
        {
            InternalTrace(null);
            return false;
        }

        /// <summary>
        /// Moves the XPathNavigator to the next namespace node.
        /// Always returns false for the HtmlNavigator implementation.
        /// </summary>
        /// <param name="scope">An XPathNamespaceScope value describing the namespace scope.</param>
        /// <returns>Always returns false for the HtmlNavigator implementation.</returns>
        public override bool MoveToNextNamespace(XPathNamespaceScope scope)
        {
            InternalTrace(null);
            return false;
        }

        /// <summary>
        /// Gets the current HTML node.
        /// </summary>
        public HtmlNode CurrentNode
        {
            get
            {
                return _currentnode;
            }
        }

        /// <summary>
        /// Gets the current HTML document.
        /// </summary>
        public HtmlDocument CurrentDocument
        {
            get
            {
                return _doc;
            }
        }
    }
}