我需要一个强大的Web Scraper库

我需要一个强大的Web scraper库来从web挖掘内容。 可以支付或免费两者对我来说都没问题。 请建议我使用库或更好的方法来挖掘数据并存储在我的首选数据库中。 我搜索过,但我没有找到任何好的解决方案。 我需要专家的好建议。 请帮帮我。

抓取(scraping)本身很容易:你只需要解析下载到的内容,并提取其中所有相关的链接。

但最重要的部分是处理HTML的部分。 因为大多数浏览器不需要最干净(或符合标准)的HTML才能进行渲染,所以您需要一个HTML解析器,它能够理解并不总是格式良好的HTML。

我建议您使用HTML Agility Pack来实现此目的。 它在处理格式不规范的HTML方面表现非常出色,并为您提供了一个简单的接口,您可以使用XPath查询在结果文档中获取节点。

除此之外,您只需选择一个数据存储来保存已处理的数据(您可以使用任何数据库技术),以及一种从Web下载内容的方法。.NET为此提供了两种高级机制:WebClient类和HttpWebRequest / HttpWebResponse类。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace SoftCircuits.Parsing
{
    /// <summary>
    /// Describes a single HTML tag returned by <see cref="HtmlParser"/>.
    /// </summary>
    public class HtmlTag
    {
        /// <summary>
        /// Name of this tag
        /// </summary>
        public string Name { get; set; }

        /// <summary>
        /// Collection of attribute names and values for this tag
        /// </summary>
        public Dictionary<string, string> Attributes { get; set; }

        /// <summary>
        /// True if this tag contained a trailing forward slash
        /// </summary>
        public bool TrailingSlash { get; set; }

        /// <summary>
        /// Indicates if this tag contains the specified attribute. Note that
        /// true is returned when this tag contains the attribute even when the
        /// attribute has no value
        /// </summary>
        /// <param name="name">Name of attribute to check</param>
        /// <returns>True if tag contains attribute or false otherwise</returns>
        public bool HasAttribute(string name)
        {
            // Attributes has a public setter and may never have been assigned
            return Attributes != null && Attributes.ContainsKey(name);
        }
    }

    /// <summary>
    /// Forward-only scanner that locates tags and their attributes in HTML text.
    /// Position tracking and character access come from the TextParser base class.
    /// </summary>
    public class HtmlParser : TextParser
    {
        public HtmlParser()
        {
        }

        public HtmlParser(string html) : base(html)
        {
        }

        /// <summary>
        /// Parses the next tag that matches the specified tag name
        /// </summary>
        /// <param name="name">Name of the tags to parse ("*" = parse all tags)</param>
        /// <param name="tag">Returns information on the next occurrence of the
        /// specified tag or null if none found</param>
        /// <returns>True if a tag was parsed or false if the end of the document
        /// was reached</returns>
        public bool ParseNext(string name, out HtmlTag tag)
        {
            // Must always set out parameter
            tag = null;

            // Nothing to do if no tag specified
            if (String.IsNullOrEmpty(name))
                return false;

            // Loop until match is found or no more tags
            MoveTo('<');
            while (!EndOfText)
            {
                // Skip over opening '<'
                MoveAhead();

                // Examine first tag character
                char c = Peek();
                if (c == '!' && Peek(1) == '-' && Peek(2) == '-')
                {
                    // Skip over comments
                    const string endComment = "-->";
                    MoveTo(endComment);
                    MoveAhead(endComment.Length);
                }
                else if (c == '/')
                {
                    // Skip over closing tags
                    MoveTo('>');
                    MoveAhead();
                }
                else
                {
                    bool result, inScript;

                    // Parse tag
                    result = ParseTag(name, ref tag, out inScript);

                    // Because scripts may contain tag characters, we have special
                    // handling to skip over script contents
                    if (inScript)
                        MovePastScript();

                    // Return true if requested tag was found
                    if (result)
                        return true;
                }

                // Find next tag
                MoveTo('<');
            }

            // No more matching tags found
            return false;
        }

        /// <summary>
        /// Parses the contents of an HTML tag. The current position should be at the first
        /// character following the tag's opening less-than character.
        ///
        /// Note: We parse to the end of the tag even if this tag was not requested by the
        /// caller. This ensures subsequent parsing takes place after this tag
        /// </summary>
        /// <param name="reqName">Name of the tag the caller is requesting, or "*" if caller
        /// is requesting all tags</param>
        /// <param name="tag">Returns information on this tag if it's one the caller is
        /// requesting</param>
        /// <param name="inScript">Returns true if tag began, and did not end, a script
        /// block</param>
        /// <returns>True if data is being returned for a tag requested by the caller
        /// or false otherwise</returns>
        protected bool ParseTag(string reqName, ref HtmlTag tag, out bool inScript)
        {
            bool doctype, requested;
            doctype = inScript = requested = false;

            // Get name of this tag
            string name = ParseTagName();

            // Special handling. Ordinal comparison keeps the match independent of the
            // current culture (e.g. "SCRIPT" must equal "script" under tr-TR as well).
            if (String.Equals(name, "!DOCTYPE", StringComparison.OrdinalIgnoreCase))
                doctype = true;
            else if (String.Equals(name, "script", StringComparison.OrdinalIgnoreCase))
                inScript = true;

            // Is this a tag requested by caller?
            if (reqName == "*" || String.Equals(name, reqName, StringComparison.OrdinalIgnoreCase))
            {
                // Yes
                requested = true;

                // Create new tag object
                tag = new HtmlTag();
                tag.Name = name;
                tag.Attributes = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
            }

            // Parse attributes
            MovePastWhitespace();
            while (Peek() != '>' && Peek() != NullChar)
            {
                if (Peek() == '/')
                {
                    // Handle trailing forward slash
                    if (requested)
                        tag.TrailingSlash = true;
                    MoveAhead();
                    MovePastWhitespace();

                    // If this is a script tag, it was closed
                    inScript = false;
                }
                else
                {
                    // Parse attribute name. DOCTYPE "attributes" are bare or quoted
                    // values rather than name=value pairs, so read them as values.
                    string attrName = (!doctype) ? ParseAttributeName() : ParseAttributeValue();
                    MovePastWhitespace();

                    // Parse attribute value
                    string value = String.Empty;
                    if (Peek() == '=')
                    {
                        MoveAhead();
                        MovePastWhitespace();
                        value = ParseAttributeValue();
                        MovePastWhitespace();
                    }

                    // Add attribute to collection if requested tag.
                    // A repeated attribute replaces the earlier one of the same name.
                    if (requested)
                        tag.Attributes[attrName] = value;
                }
            }

            // Skip over closing '>'
            MoveAhead();

            return requested;
        }

        /// <summary>
        /// Parses a tag name. The current position should be the first character of the name
        /// </summary>
        /// <returns>Returns the parsed name string</returns>
        protected string ParseTagName()
        {
            int start = Position;

            // Also stop at '/' so that "<br/>" yields "br" and the attribute loop in
            // ParseTag sees the trailing slash (callers skip closing tags before this).
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) && Peek() != '>' && Peek() != '/')
                MoveAhead();
            return Substring(start, Position);
        }

        /// <summary>
        /// Parses an attribute name. The current position should be the first character
        /// of the name
        /// </summary>
        /// <returns>Returns the parsed name string</returns>
        protected string ParseAttributeName()
        {
            int start = Position;

            // Stop at '/' as well so "<input disabled/>" yields "disabled", not "disabled/"
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) && Peek() != '>'
                && Peek() != '=' && Peek() != '/')
                MoveAhead();
            return Substring(start, Position);
        }

        /// <summary>
        /// Parses an attribute value. The current position should be the first non-whitespace
        /// character following the equal sign.
        ///
        /// Note: We terminate the name or value if we encounter a new line. This seems to
        /// be the best way of handling errors such as values missing closing quotes, etc.
        /// </summary>
        /// <returns>Returns the parsed value string</returns>
        protected string ParseAttributeValue()
        {
            int start, end;
            char c = Peek();
            if (c == '"' || c == '\'')
            {
                // Move past opening quote
                MoveAhead();

                // Parse quoted value
                start = Position;
                MoveTo(new char[] { c, '\r', '\n' });
                end = Position;

                // Move past closing quote
                if (Peek() == c)
                    MoveAhead();
            }
            else
            {
                // Parse unquoted value
                start = Position;
                while (!EndOfText && !Char.IsWhiteSpace(c) && c != '>')
                {
                    MoveAhead();
                    c = Peek();
                }
                end = Position;
            }
            return Substring(start, end);
        }

        /// <summary>
        /// Locates the end of the current script and moves past the closing tag
        /// </summary>
        protected void MovePastScript()
        {
            // NOTE(review): the literal and the first loop statements were lost from the
            // published listing; they are reconstructed here and assume TextParser offers
            // a MoveTo(string, bool ignoreCase) overload - confirm against TextParser.
            const string endScript = "</script";

            while (!EndOfText)
            {
                MoveTo(endScript, true);
                MoveAhead(endScript.Length);

                // Only accept a real closing tag, not e.g. "</scripts"
                if (Peek() == '>' || Char.IsWhiteSpace(Peek()))
                {
                    MoveTo('>');
                    MoveAhead();
                    break;
                }
            }
        }
    }
}

我的建议:

您可以四处寻找一个HTML解析器(HTML Parser),然后用它来解析站点中的信息。 (就像这里 )。 然后,您需要做的就是以您认为合适的方式将数据保存到数据库中。

我自己已经写过好几次抓取工具(scraper),这非常简单,而且可以让您自定义要保存的数据。

数据挖掘工具

如果你真的只想找一个现成的工具来做这件事,那么应该不难找到一些。

对于简单的网站(仅限纯HTML),Mechanize工作得非常好,速度很快。 对于使用Javascript,AJAX甚至Flash的网站,您需要一个真正的浏览器解决方案,例如iMacros。