// HtmlAgilityPack V1.0
/*
Copyright (C) 2003 Simon Mourier
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The name of the author may not be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
using System;
using System.IO;
using System.Net;
using System.Xml;
using System.Xml.Serialization;
using System.Xml.Xsl;
using Microsoft.Win32;
namespace HtmlAgilityPack
{
/// <summary>
/// A utility class to get HTML documents over HTTP.
/// </summary>
public class HtmlWeb
{
/// <summary>
/// Represents the method that will handle the PreRequest event.
/// </summary>
/// <param name="request">The request that is about to be executed; the handler may change it.</param>
/// <returns>true to let the request run; false to abandon it.</returns>
public delegate bool PreRequestHandler(HttpWebRequest request);
/// <summary>
/// Represents the method that will handle the PostResponse event.
/// </summary>
/// <param name="request">The request that was executed.</param>
/// <param name="response">The response that was received for it.</param>
public delegate void PostResponseHandler(HttpWebRequest request, HttpWebResponse response);
/// <summary>
/// Represents the method that will handle the PreHandleDocument event.
/// </summary>
/// <param name="document">The document that has just been loaded.</param>
public delegate void PreHandleDocumentHandler(HtmlDocument document);
// Chunk size, in bytes, used when copying a response stream to a file.
private int _streamBufferSize = 1024;
// Root directory of the on-disk cache; null means no cache is configured.
private string _cachePath;
// Backing field of UsingCache (only honored while _cachePath is not null).
private bool _usingCache;
// True when the last document was served from the cache.
private bool _fromCache;
// When true, no request is sent and documents are only read from the cache.
private bool _cacheOnly;
// When true, a fresh CookieContainer is attached to each request.
private bool _useCookies;
// Duration of the last request, measured with Environment.TickCount (milliseconds).
private int _requestDuration;
// Whether the document encoding is auto-detected when loading from a file.
private bool _autoDetectEncoding = true;
// Status of the last request issued through Load.
private HttpStatusCode _statusCode = HttpStatusCode.OK;
// URI that actually answered the last request.
private Uri _responseUri;
/// <summary>
/// Occurs before an HTTP request is executed.
/// </summary>
public PreRequestHandler PreRequest;
/// <summary>
/// Occurs after an HTTP request has been executed.
/// </summary>
public PostResponseHandler PostResponse;
/// <summary>
/// Occurs before an HTML document is handled.
/// </summary>
public PreHandleDocumentHandler PreHandleDocument;
/// <summary>
/// Creates an instance of an HtmlWeb class.
/// </summary>
public HtmlWeb()
{
}
/// <summary>
/// Gets an HTML document from an Internet resource, using the GET method,
/// and saves it to the specified file.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="path">The location of the file where you want to save the document.</param>
public void Get(string url, string path)
{
    const string defaultMethod = "GET";
    Get(url, path, defaultMethod);
}
/// <summary>
/// Gets an HTML document from an Internet resource and saves it to the specified file.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="path">The location of the file where you want to save the document.</param>
/// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
/// <exception cref="HtmlWebException">The URL scheme is neither http nor https.</exception>
public void Get(string url, string path, string method)
{
    Uri target = new Uri(url);
    bool isHttps = (target.Scheme == Uri.UriSchemeHttps);
    bool isHttp = (target.Scheme == Uri.UriSchemeHttp);
    // Only the two HTTP flavors can be downloaded; reject everything else up front.
    if (!isHttps && !isHttp)
    {
        throw new HtmlWebException("Unsupported uri scheme: '" + target.Scheme + "'.");
    }
    Get(target, method, path, null);
}
/// <summary>
/// Gets an HTML document from an Internet resource, using the GET method.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <returns>A new HTML document.</returns>
public HtmlDocument Load(string url)
{
    const string defaultMethod = "GET";
    return Load(url, defaultMethod);
}
/// <summary>
/// Loads an HTML document from an Internet resource (http/https) or from a local file (file scheme).
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND. Ignored for file URLs.</param>
/// <returns>A new HTML document.</returns>
/// <exception cref="HtmlWebException">The URL scheme is not http, https or file.</exception>
public HtmlDocument Load(string url, string method)
{
    Uri uri = new Uri(url);
    HtmlDocument doc;
    if ((uri.Scheme == Uri.UriSchemeHttps) || (uri.Scheme == Uri.UriSchemeHttp))
    {
        doc = LoadUrl(uri, method);
    }
    else if (uri.Scheme == Uri.UriSchemeFile)
    {
        doc = new HtmlDocument();
        // Use the same parser options as LoadUrl so a document parses identically
        // whether it comes from the network or from disk. (The previous code assigned
        // OptionAutoCloseOnEnd twice and never set OptionFixNestedTags.)
        doc.OptionAutoCloseOnEnd = false;
        doc.OptionFixNestedTags = true;
        doc.DetectEncodingAndLoad(url, _autoDetectEncoding);
    }
    else
    {
        throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
    }
    // Give subscribers a chance to inspect or alter the document before it is returned.
    if (PreHandleDocument != null)
    {
        PreHandleDocument(doc);
    }
    return doc;
}
// Tells whether the cached file at 'path' is HTML, judging by the content type
// registered for its file extension.
private bool IsCacheHtmlContent(string path)
{
    string ct = GetContentTypeForExtension(Path.GetExtension(path), null);
    // No registered content type (ct == null) means "not HTML" rather than a crash.
    return (ct != null) && IsHtmlContent(ct);
}
// Tells whether a Content-Type header value denotes HTML ("text/html", any casing,
// optionally followed by parameters such as a charset). A null value is not HTML.
private bool IsHtmlContent(string contentType)
{
    if (contentType == null)
    {
        return false;
    }
    // Lower-case with the invariant culture so the result does not depend on the current locale.
    string lowered = contentType.ToLower(System.Globalization.CultureInfo.InvariantCulture);
    return lowered.StartsWith("text/html");
}
// Builds the path of the file holding the cached response headers for 'uri':
// the cached document path with ".h.xml" appended.
private string GetCacheHeadersPath(Uri uri)
{
    const string headersSuffix = ".h.xml";
    string documentPath = GetCachePath(uri);
    return documentPath + headersSuffix;
}
/// <summary>
/// Gets the cache file path for a specified url.
/// </summary>
/// <param name="uri">The url for which to retrieve the cache path. May not be null.</param>
/// <returns>The cache file path.</returns>
/// <exception cref="ArgumentNullException">uri is null.</exception>
/// <exception cref="HtmlWebException">The cache is not enabled.</exception>
public string GetCachePath(Uri uri)
{
    if (uri == null)
    {
        throw new ArgumentNullException("uri");
    }
    if (!UsingCache)
    {
        throw new HtmlWebException("Cache is not enabled. Set UsingCache to true first.");
    }
    // Always key the entry on the host: previously the root page ("/") of every site
    // was stored in the same "<cache>\.htm" file, so sites overwrote each other.
    string relative = uri.Host + uri.AbsolutePath;
    // A URL ending with '/' names a directory; give it a file name so it can be
    // written to disk (the root page thus becomes "<cache>\<host>\.htm").
    if ((relative.Length > 0) && (relative[relative.Length - 1] == '/'))
    {
        relative += ".htm";
    }
    // NOTE(review): backslash separators make the layout Windows-specific, as before.
    return Path.Combine(_cachePath, relative.Replace('/', '\\'));
}
/// <summary>
/// Gets a value indicating if the last document was retrieved from the cache.
/// </summary>
public bool FromCache
{
    get { return _fromCache; }
}
/// <summary>
/// Gets the URI of the Internet resource that actually responded to the request.
/// </summary>
public Uri ResponseUri
{
    get { return _responseUri; }
}
/// <summary>
/// Gets or Sets a value indicating whether to get document only from the cache.
/// If this is set to true and document is not found in the cache, nothing will be loaded.
/// </summary>
/// <exception cref="HtmlWebException">Set to true while the cache is not enabled.</exception>
public bool CacheOnly
{
    get { return _cacheOnly; }
    set
    {
        // Turning cache-only mode on makes no sense without a usable cache.
        if (value)
        {
            if (!UsingCache)
            {
                throw new HtmlWebException("Cache is not enabled. Set UsingCache to true first.");
            }
        }
        _cacheOnly = value;
    }
}
/// <summary>
/// Gets or Sets a value indicating if cookies will be stored.
/// </summary>
public bool UseCookies
{
    get { return _useCookies; }
    set { _useCookies = value; }
}
/// <summary>
/// Gets the last request duration in milliseconds.
/// </summary>
public int RequestDuration
{
    get { return _requestDuration; }
}
/// <summary>
/// Gets or Sets a value indicating if document encoding must be automatically detected.
/// </summary>
public bool AutoDetectEncoding
{
    get { return _autoDetectEncoding; }
    set { _autoDetectEncoding = value; }
}
/// <summary>
/// Gets the last request status.
/// </summary>
public HttpStatusCode StatusCode
{
    get { return _statusCode; }
}
/// <summary>
/// Gets or Sets the size of the buffer used for memory operations.
/// </summary>
/// <exception cref="ArgumentException">The assigned size is zero or negative.</exception>
public int StreamBufferSize
{
    get
    {
        return _streamBufferSize;
    }
    set
    {
        // Validate the incoming value (the old code tested the current field,
        // so an invalid size was always accepted).
        if (value <= 0)
        {
            throw new ArgumentException("Size must be greater than zero.");
        }
        _streamBufferSize = value;
    }
}
// Downloads 'uri' with the given HTTP method into a new HtmlDocument and records
// the resulting status in _statusCode. When Get reports NotModified, the document
// is read from the cache file instead.
private HtmlDocument LoadUrl(Uri uri, string method)
{
    HtmlDocument result = new HtmlDocument();
    result.OptionAutoCloseOnEnd = false;
    result.OptionFixNestedTags = true;
    HttpStatusCode status = Get(uri, method, null, result);
    _statusCode = status;
    if (status != HttpStatusCode.NotModified)
    {
        return result;
    }
    // read cached encoding
    string cachedFile = GetCachePath(uri);
    result.DetectEncodingAndLoad(cachedFile);
    return result;
}
// Executes the HTTP request for 'uri' and delivers the result to the cache, to the
// file 'path', and/or to the in-memory document 'doc'.
//   uri    - resource to request.
//   method - HTTP method to use.
//   path   - file to write the document to, or null.
//   doc    - document to load the HTML into, or null.
// Returns NotModified when the document is served from the cache, ResetContent when
// the PreRequest handler cancels the request, otherwise the response status code.
// Throws HtmlWebException when CacheOnly is set and the document is not cached, and
// rethrows the WebException when the request fails and no cached copy exists.
private HttpStatusCode Get(Uri uri, string method, string path, HtmlDocument doc)
{
    string cachePath = null;
    bool oldFile = false;
    HttpWebRequest req = WebRequest.Create(uri) as HttpWebRequest;
    req.Method = method;
    _fromCache = false;
    _requestDuration = 0;
    int tc = Environment.TickCount;
    if (UsingCache)
    {
        cachePath = GetCachePath(req.RequestUri);
        if (File.Exists(cachePath))
        {
            // SaveStream stamps the cache file's last-write time with the server's
            // Last-Modified value, so that is the time to send back. (The last-access
            // time used previously changes on reads and is unrelated to the server.)
            req.IfModifiedSince = File.GetLastWriteTime(cachePath);
            oldFile = true;
        }
    }
    if (_cacheOnly)
    {
        if (!File.Exists(cachePath))
        {
            throw new HtmlWebException("File was not found at cache path: '" + cachePath + "'");
        }
        CopyCachedFile(cachePath, path);
        _fromCache = true;
        return HttpStatusCode.NotModified;
    }
    if (_useCookies)
    {
        req.CookieContainer = new CookieContainer();
    }
    if (PreRequest != null)
    {
        // allow our user to change the request at will
        if (!PreRequest(req))
        {
            return HttpStatusCode.ResetContent;
        }
    }
    HttpWebResponse resp;
    try
    {
        resp = req.GetResponse() as HttpWebResponse;
    }
    catch (WebException we)
    {
        _requestDuration = Environment.TickCount - tc;
        // Protocol-level failures still carry a response that is handled below.
        resp = (HttpWebResponse)we.Response;
        if (resp == null)
        {
            if (oldFile)
            {
                // No response at all: fall back to the copy we already have.
                CopyCachedFile(cachePath, path);
                _fromCache = true;
                return HttpStatusCode.NotModified;
            }
            throw;
        }
    }
    catch (Exception)
    {
        _requestDuration = Environment.TickCount - tc;
        throw;
    }
    try
    {
        // allow our user to get some info from the response
        if (PostResponse != null)
        {
            PostResponse(req, resp);
        }
        _requestDuration = Environment.TickCount - tc;
        _responseUri = resp.ResponseUri;
        bool html = IsHtmlContent(resp.ContentType);
        if (resp.StatusCode == HttpStatusCode.NotModified)
        {
            if (!UsingCache)
            {
                // this should *never* happen...
                throw new HtmlWebException("Server has send a NotModifed code, without cache enabled.");
            }
            _fromCache = true;
            CopyCachedFile(cachePath, path);
            return resp.StatusCode;
        }
        Stream s = resp.GetResponseStream();
        if (s != null)
        {
            if (UsingCache)
            {
                // NOTE: LastModified does not contain milliseconds, so we remove them to the file
                SaveStream(s, cachePath, RemoveMilliseconds(resp.LastModified), _streamBufferSize);
                // save headers
                SaveCacheHeaders(req.RequestUri, resp);
                CopyCachedFile(cachePath, path);
                // The stream has been consumed by the cache file; populate the document
                // from it so a first (uncached) download does not return an empty document.
                if ((doc != null) && (html))
                {
                    doc.DetectEncodingAndLoad(cachePath);
                }
            }
            else if ((doc != null) && (html))
            {
                // try to work in-memory
                System.Text.Encoding respenc = null;
                string encodingName = resp.ContentEncoding;
                if ((encodingName != null) && (encodingName.Length > 0))
                {
                    try
                    {
                        respenc = System.Text.Encoding.GetEncoding(encodingName);
                    }
                    catch (ArgumentException)
                    {
                        // Content-Encoding usually names a compression scheme ("gzip"),
                        // not a character set: let the document detect its encoding.
                        respenc = null;
                    }
                    catch (NotSupportedException)
                    {
                        respenc = null;
                    }
                }
                if (respenc != null)
                {
                    doc.Load(s, respenc);
                }
                else
                {
                    doc.Load(s);
                }
            }
            else if (path != null)
            {
                // No cache to copy from: write the response straight to the requested file.
                SaveStream(s, path, RemoveMilliseconds(resp.LastModified), _streamBufferSize);
            }
        }
        return resp.StatusCode;
    }
    finally
    {
        // Release the connection on every exit path, including NotModified and errors.
        resp.Close();
    }
}
// Copies the cached file to 'path' (when a target file was requested) and gives the
// copy the same last-write time as the cache file.
private static void CopyCachedFile(string cachePath, string path)
{
    if (path == null)
    {
        return;
    }
    IOLibrary.CopyAlways(cachePath, path);
    // touch the file
    File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
}
// Reads one header value from the headers file cached next to the document of
// 'requestUri'. Header names are compared case-insensitively; 'def' is returned
// when the header is not present.
// note: some headers are collection (ex: www-authenticate); we don't handle that here.
private string GetCacheHeader(Uri requestUri, string name, string def)
{
    XmlDocument doc = new XmlDocument();
    doc.Load(GetCacheHeadersPath(requestUri));
    // Scan the entries instead of concatenating 'name' into an XPath expression,
    // which broke on names containing an apostrophe.
    foreach (XmlNode node in doc.SelectNodes("//h"))
    {
        XmlAttribute nameAtt = node.Attributes["n"];
        if (nameAtt == null)
        {
            continue;
        }
        if (string.Compare(nameAtt.Value, name, true, System.Globalization.CultureInfo.InvariantCulture) != 0)
        {
            continue;
        }
        // SaveCacheHeaders stores the name in "n" and the value in "v"
        // (the old code looked up an attribute named after the header itself).
        XmlAttribute valueAtt = node.Attributes["v"];
        return (valueAtt != null) ? valueAtt.Value : def;
    }
    return def;
}
// Writes the response headers to an XML file stored aside the cached document:
// a root element "c" holding one <h n="name" v="value"/> element per header.
private void SaveCacheHeaders(Uri requestUri, HttpWebResponse resp)
{
    string file = GetCacheHeadersPath(requestUri);
    XmlDocument doc = new XmlDocument();
    // Create the root element explicitly; loading an empty string always
    // failed with an XmlException ("root element is missing").
    XmlNode cache = doc.AppendChild(doc.CreateElement("c"));
    foreach (string header in resp.Headers)
    {
        XmlElement entry = doc.CreateElement("h");
        entry.SetAttribute("n", header);
        entry.SetAttribute("v", resp.Headers[header]);
        cache.AppendChild(entry);
    }
    doc.Save(file);
}
// Copies 'stream' to the file 'path' (created or overwritten) in chunks of
// 'streamBufferSize' bytes, closes both, then sets the file's last-write time to
// 'touchDate'. Returns the number of bytes written.
private static long SaveStream(Stream stream, string path, DateTime touchDate, int streamBufferSize)
{
    FilePreparePath(path);
    FileStream output = new FileStream(path, FileMode.Create, FileAccess.Write);
    BinaryReader reader = null;
    BinaryWriter writer = null;
    long total;
    try
    {
        reader = new BinaryReader(stream);
        writer = new BinaryWriter(output);
        total = 0;
        // ReadBytes returns an empty array once the input is exhausted.
        for (byte[] chunk = reader.ReadBytes(streamBufferSize);
             chunk.Length > 0;
             chunk = reader.ReadBytes(streamBufferSize))
        {
            total += chunk.Length;
            writer.Write(chunk);
        }
    }
    finally
    {
        // Closing the reader also closes the source stream.
        if (reader != null)
        {
            reader.Close();
        }
        if (writer != null)
        {
            writer.Flush();
            writer.Close();
        }
        if (output != null)
        {
            output.Close();
        }
    }
    File.SetLastWriteTime(path, touchDate);
    return total;
}
// Makes sure a file can be written at 'target': clears the read-only attribute of
// an existing file, otherwise creates the missing parent directory.
private static void FilePreparePath(string target)
{
    if (File.Exists(target))
    {
        FileAttributes atts = File.GetAttributes(target);
        File.SetAttributes(target, atts & ~FileAttributes.ReadOnly);
        return;
    }
    string dir = Path.GetDirectoryName(target);
    // A bare file name has no directory part; there is nothing to create then
    // (Directory.CreateDirectory rejects an empty path).
    if ((dir != null) && (dir.Length > 0) && !Directory.Exists(dir))
    {
        Directory.CreateDirectory(dir);
    }
}
// Returns 't' truncated to a whole second (any sub-second part is dropped).
private static DateTime RemoveMilliseconds(DateTime t)
{
    long subSecondTicks = t.Ticks % TimeSpan.TicksPerSecond;
    return new DateTime(t.Ticks - subSecondTicks);
}
/// <summary>
/// Gets the path extension for a given MIME content type, as registered under
/// HKEY_CLASSES_ROOT\MIME\Database\Content Type.
/// </summary>
/// <param name="contentType">The input MIME content type.</param>
/// <param name="def">The default path extension to return if any error occurs.</param>
/// <returns>The MIME content type's path extension.</returns>
public static string GetExtensionForContentType(string contentType, string def)
{
    if ((contentType == null) || (contentType.Length == 0))
    {
        return def;
    }
    RegistryKey key = null;
    try
    {
        key = Registry.ClassesRoot.OpenSubKey(@"MIME\Database\Content Type\" + contentType, false);
        if (key == null)
        {
            // Content type is not registered.
            return def;
        }
        string ext = key.GetValue("Extension", def) as string;
        return (ext != null) ? ext : def;
    }
    catch (Exception)
    {
        // Best effort by contract: any registry failure yields the default.
        return def;
    }
    finally
    {
        // Release the registry handle (it was previously leaked).
        if (key != null)
        {
            key.Close();
        }
    }
}
/// <summary>
/// Gets the MIME content type for a given path extension, as registered under
/// HKEY_CLASSES_ROOT\&lt;extension&gt;.
/// </summary>
/// <param name="extension">The input path extension, including the leading dot as returned by Path.GetExtension.</param>
/// <param name="def">The default content type to return if any error occurs.</param>
/// <returns>The path extension's MIME content type.</returns>
public static string GetContentTypeForExtension(string extension, string def)
{
    if ((extension == null) || (extension.Length == 0))
    {
        return def;
    }
    RegistryKey key = null;
    try
    {
        key = Registry.ClassesRoot.OpenSubKey(extension, false);
        if (key == null)
        {
            // Extension is not registered.
            return def;
        }
        // The MIME type lives in the "Content Type" value; the key's default value
        // (read previously) is the ProgID, e.g. "htmlfile", not a content type.
        string contentType = key.GetValue("Content Type", def) as string;
        return (contentType != null) ? contentType : def;
    }
    catch (Exception)
    {
        // Best effort by contract: any registry failure yields the default.
        return def;
    }
    finally
    {
        // Release the registry handle (it was previously leaked).
        if (key != null)
        {
            key.Close();
        }
    }
}
/// <summary>
/// Loads an HTML document from an Internet resource and saves it to the specified XmlTextWriter.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="writer">The XmlTextWriter to which you want to save.</param>
public void LoadHtmlAsXml(string htmlUrl, XmlTextWriter writer)
{
    Load(htmlUrl).Save(writer);
}
/// <summary>
/// Loads an HTML document from an Internet resource and saves it to the specified XmlTextWriter,
/// after an XSLT transformation. No intermediate XML file is written.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
/// <param name="writer">The XmlTextWriter to which you want to save.</param>
public void LoadHtmlAsXml(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, XmlTextWriter writer)
{
    const string noDebugFile = null;
    LoadHtmlAsXml(htmlUrl, xsltUrl, xsltArgs, writer, noDebugFile);
}
/// <summary>
/// Loads an HTML document from an Internet resource and saves it to the specified XmlTextWriter,
/// after an XSLT transformation. The transform receives the extra parameters "url",
/// "requestDuration" and "fromCache".
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp". May not be null.</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform. May be null.</param>
/// <param name="writer">The XmlTextWriter to which you want to save.</param>
/// <param name="xmlPath">A file path where the temporary XML before transformation will be saved, or null. Mostly used for debugging purposes.</param>
/// <exception cref="ArgumentNullException">htmlUrl is null.</exception>
public void LoadHtmlAsXml(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, XmlTextWriter writer, string xmlPath)
{
    if (htmlUrl == null)
    {
        throw new ArgumentNullException("htmlUrl");
    }
    HtmlDocument doc = Load(htmlUrl);
    if (xmlPath != null)
    {
        XmlTextWriter w = new XmlTextWriter(xmlPath, doc.Encoding);
        try
        {
            doc.Save(w);
        }
        finally
        {
            // Release the debug file even when saving fails.
            w.Close();
        }
    }
    if (xsltArgs == null)
    {
        xsltArgs = new XsltArgumentList();
    }
    // add some useful variables to the xslt doc
    // AddParam throws when a parameter already exists, which happened as soon as a
    // caller reused its argument list for a second call; drop stale values first.
    xsltArgs.RemoveParam("url", "");
    xsltArgs.AddParam("url", "", htmlUrl);
    xsltArgs.RemoveParam("requestDuration", "");
    xsltArgs.AddParam("requestDuration", "", RequestDuration);
    xsltArgs.RemoveParam("fromCache", "");
    xsltArgs.AddParam("fromCache", "", FromCache);
    XslTransform xslt = new XslTransform();
    xslt.Load(xsltUrl);
    xslt.Transform(doc, xsltArgs, writer);
}
/// <summary>
/// Creates an instance of the given type from the specified Internet resource,
/// without applying any XSLT transformation.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="type">The requested type.</param>
/// <returns>A newly created instance.</returns>
public object CreateInstance(string url, Type type)
{
    string noStylesheet = null;
    XsltArgumentList noArguments = null;
    return CreateInstance(url, noStylesheet, noArguments, type);
}
/// <summary>
/// Creates an instance of the given type from the specified Internet resource.
/// No intermediate XML file is written.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
/// <param name="type">The requested type.</param>
/// <returns>A newly created instance.</returns>
public object CreateInstance(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, Type type)
{
    string noDebugFile = null;
    return CreateInstance(htmlUrl, xsltUrl, xsltArgs, type, noDebugFile);
}
/// <summary>
/// Creates an instance of the given type from the specified Internet resource: the document
/// is converted to XML (optionally through an XSLT stylesheet) and then deserialized with
/// an XmlSerializer for the requested type.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load, or null for no transformation.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
/// <param name="type">The requested type.</param>
/// <param name="xmlPath">A file path where the temporary XML before transformation will be saved, or null. Mostly used for debugging purposes.</param>
/// <returns>A newly created instance.</returns>
/// <exception cref="Exception">Deserialization failed; the message contains the XML and InnerException holds the original error.</exception>
public object CreateInstance(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, Type type, string xmlPath)
{
    StringWriter sw = new StringWriter();
    XmlTextWriter writer = new XmlTextWriter(sw);
    if (xsltUrl == null)
    {
        LoadHtmlAsXml(htmlUrl, writer);
    }
    else
    {
        // A null xmlPath is fine here: the four-argument overload forwards null as well.
        LoadHtmlAsXml(htmlUrl, xsltUrl, xsltArgs, writer, xmlPath);
    }
    writer.Flush();
    string xml = sw.ToString();
    XmlSerializer serializer = new XmlSerializer(type);
    XmlTextReader reader = new XmlTextReader(new StringReader(xml));
    try
    {
        return serializer.Deserialize(reader);
    }
    catch (InvalidOperationException ex)
    {
        // Same message as before, but keep the original exception attached.
        throw new Exception(ex.ToString() + ", --- xml:" + xml, ex);
    }
    finally
    {
        reader.Close();
    }
}
/// <summary>
/// Gets or Sets the cache path. If null, no caching mechanism will be used.
/// </summary>
public string CachePath
{
    get { return _cachePath; }
    set { _cachePath = value; }
}
/// <summary>
/// Gets or Sets a value indicating whether the caching mechanisms should be used or not.
/// Always reads as false while no cache path is defined.
/// </summary>
/// <exception cref="HtmlWebException">Set to true while no cache path is defined.</exception>
public bool UsingCache
{
    get { return (_cachePath != null) && _usingCache; }
    set
    {
        bool hasCachePath = (_cachePath != null);
        if (value && !hasCachePath)
        {
            throw new HtmlWebException("You need to define a CachePath first.");
        }
        _usingCache = value;
    }
}
}
/// <summary>
/// Represents an exception thrown by the HtmlWeb utility class.
/// </summary>
public class HtmlWebException : Exception
{
    /// <summary>
    /// Creates an instance of the HtmlWebException.
    /// </summary>
    /// <param name="message">The exception's message.</param>
    public HtmlWebException(string message)
        : base(message)
    {
    }
}
}