// // FilterHtml.cs // // Copyright (C) 2005 Debajyoti Bera // Copyright (C) 2004 Novell, Inc. // // // Permission is hereby granted, free of charge, to any person obtaining a // copy of this software and associated documentation files (the "Software"), // to deal in the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS IN THE SOFTWARE. 
//

using System;
using System.Collections;
using System.IO;
using System.Text;
using SW=System.Web;

using Beagle.Daemon;
using Beagle.Util;

using HtmlAgilityPack;

namespace Beagle.Filters {

	/// <summary>
	/// Filter for "text/html" documents.  The document is parsed with
	/// HtmlAgilityPack in stream mode; every node the parser reports is
	/// passed to <see cref="HandleNodeEvent"/>, which turns it into calls
	/// on the inherited filter API (AppendText, AppendWhiteSpace,
	/// AppendStructuralBreak, HotUp/HotDown and AddProperty).
	/// </summary>
	public class FilterHtml : Beagle.Daemon.Filter {

		// Names of the currently open "hot" elements.  When a hot start
		// tag (e.g. the one for "b") is seen its name is pushed; when a
		// hot end tag is seen the top entry is popped.
		// For good error checking we should compare the current element
		// with what was popped.  Currently we just pop, which may allow
		// unmatched elements to pass through.
		private readonly Stack hot_stack;

		// Names of the currently open content-free elements (script,
		// map, style).  Text is dropped while this stack is non-empty.
		private readonly Stack ignore_stack;

		// True while we are between the start and end tag of <title>;
		// text is then collected in 'builder' instead of being indexed.
		private bool building_text;
		private readonly StringBuilder builder;

		public FilterHtml ()
		{
			// 1: Add meta keyword fields as meta:key
			SetVersion (1);

			RegisterSupportedTypes ();
			SnippetMode = true;

			hot_stack = new Stack ();
			ignore_stack = new Stack ();
			building_text = false;
			builder = new StringBuilder ();
		}

		// Safeguard against spurious stack pops caused by mismatched
		// tags in bad html files.
		// FIXME: If matching elements is not required
		// and if HtmlAgilityPack matches elements itself,
		// then we can just use a counter hot_stack_depth
		// instead of the hot_stack
		private void SafePop (Stack st)
		{
			if (st != null && st.Count != 0)
				st.Pop ();
		}

		/// <summary>
		/// Returns true if text inside the named element should be
		/// emitted between HotUp () and HotDown ().
		/// </summary>
		protected bool NodeIsHot (String nodeName)
		{
			return nodeName == "b"
				|| nodeName == "u"
				|| nodeName == "em"
				|| nodeName == "strong"
				|| nodeName == "big"
				|| nodeName == "h1"
				|| nodeName == "h2"
				|| nodeName == "h3"
				|| nodeName == "h4"
				|| nodeName == "h5"
				|| nodeName == "h6"
				|| nodeName == "i"
				|| nodeName == "th";
		}

		/// <summary>
		/// Returns true if whitespace must be emitted at both the start
		/// and the end tag of the named element.
		/// </summary>
		protected static bool NodeBreaksText (String nodeName)
		{
			return nodeName == "td"
				|| nodeName == "a"
				|| nodeName == "div"
				|| nodeName == "option";
		}

		/// <summary>
		/// Returns true if a structural break must be emitted at the
		/// end tag of the named element.
		/// </summary>
		protected static bool NodeBreaksStructure (string nodeName)
		{
			return nodeName == "p"
				|| nodeName == "br"
				|| nodeName == "h1"
				|| nodeName == "h2"
				|| nodeName == "h3"
				|| nodeName == "h4"
				|| nodeName == "h5"
				|| nodeName == "h6";
		}

		/// <summary>
		/// Returns true if the contents of the named element carry no
		/// indexable text and must be skipped.
		/// </summary>
		protected static bool NodeIsContentFree (String nodeName)
		{
			return nodeName == "script"
				|| nodeName == "map"
				|| nodeName == "style";
		}

		/// <summary>
		/// Callback attached to HtmlDocument.ReportNode.  Dispatches on
		/// the node type: document/element nodes update the filter
		/// state, text nodes are either collected (title) or indexed.
		/// </summary>
		/// <param name="node">The node reported by the parser.</param>
		/// <returns>
		/// The value of the inherited AllowMoreWords (); false
		/// presumably tells the parser to stop reporting nodes
		/// (NOTE(review): confirm against the ReportNode contract).
		/// </returns>
		protected bool HandleNodeEvent (HtmlNode node)
		{
			switch (node.NodeType) {

			case HtmlNodeType.Document:
			case HtmlNodeType.Element:
				HandleElement (node);
				break;

			case HtmlNodeType.Text:
				HandleText ((HtmlTextNode) node);
				break;
			}

			return AllowMoreWords ();
		}

		// Routes an element (start or end tag) to the matching handler.
		private void HandleElement (HtmlNode node)
		{
			string name = node.Name;

			if (name == "title") {
				HandleTitle (node);
			} else if (name == "meta") {
				HandleMeta (node);
			} else if (NodeIsContentFree (name)) {
				// Ignore the contents of such a node until it is closed.
				if (node.StartTag)
					ignore_stack.Push (name);
				else
					SafePop (ignore_stack);
			} else {
				HandleContentElement (node);
			}
		}

		// <title>: start collecting text at the start tag, publish the
		// collected text as dc:title at the end tag.
		private void HandleTitle (HtmlNode node)
		{
			if (node.StartTag) {
				builder.Length = 0;
				building_text = true;
				return;
			}

			String title = HtmlEntity.DeEntitize (builder.ToString ().Trim ());
			AddProperty (Beagle.Property.New ("dc:title", title));
			builder.Length = 0;
			building_text = false;
		}

		// <meta name="..." content="...">: publish as meta:<name> when
		// both attributes are present and non-empty.
		private void HandleMeta (HtmlNode node)
		{
			string name = node.GetAttributeValue ("name", "");
			string content = node.GetAttributeValue ("content", "");

			if (name != "" && content != "")
				AddProperty (Beagle.Property.New ("meta:" + name, content));
		}

		// Any element that is not title, meta or content-free.
		private void HandleContentElement (HtmlNode node)
		{
			string name = node.Name;
			bool isStart = node.StartTag;
			bool isHot = NodeIsHot (name);
			bool breaksText = NodeBreaksText (name);
			bool breaksStructure = NodeBreaksStructure (name);

			if (isStart) {
				if (isHot) {
					// Only the outermost hot element switches hot mode on.
					if (hot_stack.Count == 0)
						HotUp ();
					hot_stack.Push (name);
				}

				if (breaksText)
					AppendWhiteSpace ();

				if (name == "img")
					AppendAttributeText (node.GetAttributeValue ("alt", ""), false);
				else if (name == "a")
					AppendAttributeText (node.GetAttributeValue ("href", ""), true);

				return;
			}

			if (breaksText)
				AppendWhiteSpace ();

			if (breaksStructure)
				AppendStructuralBreak ();

			// Only leave hot mode when this end tag actually closed the
			// last open hot element.  A stray end tag (empty stack) must
			// not call HotDown () without a matching HotUp ().
			if (isHot && hot_stack.Count != 0) {
				hot_stack.Pop ();
				if (hot_stack.Count == 0)
					HotDown ();
			}
		}

		// Indexes the value of an attribute (img alt, a href) as text.
		// The value is surrounded by whitespace so that it does not
		// fuse with the text before or after the tag into one token.
		private void AppendAttributeText (string value, bool urlDecode)
		{
			if (value == "")
				return;

			if (urlDecode)
				value = SW.HttpUtility.UrlDecode (value);

			AppendWhiteSpace ();
			AppendText (HtmlEntity.DeEntitize (value));
			AppendWhiteSpace ();
		}

		// Text node: dropped inside content-free elements, collected
		// while inside <title>, indexed otherwise.
		private void HandleText (HtmlTextNode node)
		{
			// FIXME Do we need to trim the text ?
			String text = node.Text;

			if (ignore_stack.Count != 0)
				return; // still ignoring ...

			if (building_text)
				builder.Append (text);
			else
				AppendText (HtmlEntity.DeEntitize (text));
		}

		/// <summary>
		/// Parses the filter's Stream as HTML.  The encoding is taken
		/// from the indexable property
		/// StringFu.UnindexedNamespace + "encoding" when present and
		/// usable, otherwise it is detected from the document.
		/// Finished () is always called before returning.
		/// </summary>
		/// <param name="info">Not used by this implementation.</param>
		protected override void DoOpen (FileInfo info)
		{
			Encoding enc = null;

			foreach (Property prop in IndexableProperties) {
				if (prop.Key != StringFu.UnindexedNamespace + "encoding")
					continue;

				try {
					enc = Encoding.GetEncoding ((string) prop.Value);
				} catch (NotSupportedException) {
					// Encoding passed in isn't supported.  Maybe
					// we'll get lucky detecting it from the
					// document instead.
				} catch (ArgumentException) {
					// Encoding name is null, empty or unknown;
					// fall back to detection as well.
				}

				break;
			}

			if (enc == null) {
				// We need to tell the parser to detect the encoding.
				HtmlDocument temp_doc = new HtmlDocument ();
				enc = temp_doc.DetectEncoding (Stream);
				temp_doc = null;
				// Detection consumed part of the stream; start over.
				Stream.Seek (0, SeekOrigin.Begin);
			}

			HtmlDocument doc = new HtmlDocument ();
			doc.ReportNode += HandleNodeEvent;
			doc.StreamMode = true;
			// We already determined the encoding.
			doc.OptionReadEncoding = false;

			try {
				if (enc == null)
					doc.Load (Stream);
				else
					doc.Load (Stream, enc);
			} catch (NotSupportedException) {
				LoadAsAscii (doc);
			} catch (Exception e) {
				ReportLoadFailure (e);
			}

			Finished ();
		}

		// Last-resort load used when the chosen encoding turned out to
		// be unsupported.  The stream is rewound first because the
		// failed attempt may already have consumed data.
		// NOTE(review): assumes the failed attempt did not report any
		// node yet (i.e. it failed while setting up decoding) - confirm.
		private void LoadAsAscii (HtmlDocument doc)
		{
			try {
				Stream.Seek (0, SeekOrigin.Begin);
				doc.Load (Stream, Encoding.ASCII);
			} catch (Exception e) {
				// Never let a failing fallback escape DoOpen ().
				ReportLoadFailure (e);
			}
		}

		// Prints the message and stack trace of a load failure.
		private static void ReportLoadFailure (Exception e)
		{
			Console.WriteLine (e.Message);
			Console.WriteLine (e.StackTrace);
		}

		/// <summary>
		/// Registers the mime types handled by this filter.  Called
		/// from the constructor; subclasses may override it.
		/// </summary>
		protected virtual void RegisterSupportedTypes ()
		{
			AddSupportedFlavor (FilterFlavor.NewFromMimeType ("text/html"));
		}
	}
}