在C#中使用Html Agility Pack解析HTML表格的单元格

我需要解析HTML代码,更具体地说,是解析所有表格中每一行的每个单元格。 每一行代表一个对象,每个单元格代表该对象的一个属性。 我想把这些数据解析出来,写入一个只包含数据的XML文件(不含无用的HTML代码)。 我已经能够成功解析HTML文件中的每一列,但现在不知道该如何把这些列写入XML文件,对此感到很困惑。

HTML:

  1   Sidney Crosby   PIT   C   39   32   33    65    20   29   10   1   3     0   154   20.8   21:54   22.6   55.7   

C#:

 using System;
 using HtmlAgilityPack;

 namespace Stats
 {
     /// <summary>
     /// Loads an HTML document from a string and shows the text of every
     /// non-empty <c>td</c> cell with class <c>statBox</c> found in the first table.
     /// </summary>
     class StatsParser
     {
         private string htmlCode;

         // NOTE(review): ToShortDateString() is culture-dependent and may contain
         // characters such as '/' that are not valid in a file name - confirm.
         private static string fileName = "[" + DateTime.Now.ToShortDateString() + " NHL Stats].xml";

         /// <summary>
         /// Stores the HTML and parses it immediately.
         /// </summary>
         /// <param name="htmlCode">Raw HTML markup to parse.</param>
         public StatsParser(string htmlCode)
         {
             this.htmlCode = htmlCode;
             this.ParseHtml();
         }

         /// <summary>
         /// Walks every row of the first table and shows each non-empty stat cell
         /// in a message box. Rows without stat cells are skipped.
         /// </summary>
         public void ParseHtml()
         {
             HtmlDocument doc = new HtmlDocument();
             doc.LoadHtml(htmlCode);

             // SelectNodes returns null (not an empty collection) when nothing
             // matches, so check explicitly instead of catching NullReferenceException.
             HtmlNodeCollection tables = doc.DocumentNode.SelectNodes("//table");
             HtmlNodeCollection rows = tables == null ? null : tables[0].SelectNodes(".//tr");
             if (rows == null)
             {
                 // NOTE(review): the snippet mixes System.Windows.MessageBox and
                 // System.Windows.Forms.MessageBox; both are kept as in the original.
                 System.Windows.Forms.MessageBox.Show("Exception!!");
                 return;
             }

             foreach (HtmlNode row in rows)
             {
                 HtmlNodeCollection cols = row.SelectNodes(".//td[@class='statBox']");
                 if (cols == null)
                 {
                     // A row with no stat cells (e.g. a header row) must not abort the parse.
                     continue;
                 }

                 foreach (HtmlNode col in cols)
                 {
                     // Get the value of the column and print it
                     string value = col.InnerText;
                     if (!String.IsNullOrEmpty(value))
                     {
                         System.Windows.MessageBox.Show(value);
                     }
                 }
             }
         }
     }
 }

XML:

    Sidney Crosby PIT C 39 32 33   

查阅MSDN之后,我终于找到了解决这个问题的实现方案:

  using System;
  using HtmlAgilityPack;
  using System.Xml;

  namespace HockeyStats
  {
      /// <summary>
      /// Parses player statistics out of HTML markup and writes them to an XML file,
      /// one <c>Player</c> element per table row.
      /// </summary>
      class StatsParser
      {
          // XML element name for each stat cell that follows the rank cell, in column order.
          private static readonly string[] StatElementNames =
          {
              "Name", "Team", "Pos", "GP", "G", "A", "PlusMinus", "PIM", "PP", "SH",
              "GW", "OT", "Shots", "ShotPctg", "TOIPerGame", "ShiftsPerGame", "FOWinPctg"
          };

          private string htmlCode;

          // NOTE(review): ToShortDateString() is culture-dependent and may contain
          // characters such as '/' that are not valid in a file name - confirm.
          private static string fileName = "[" + DateTime.Now.ToShortDateString() + " NHL Stats].xml";

          /// <summary>
          /// Stores the HTML and parses it immediately.
          /// </summary>
          /// <param name="htmlCode">Raw HTML markup to parse.</param>
          public StatsParser(string htmlCode)
          {
              this.htmlCode = htmlCode;
              this.ParseHtml();
          }

          /// <summary>
          /// Writes a <c>Stats</c> document containing one <c>Player</c> element for
          /// every nested table row that has <c>statBox</c> cells.
          /// </summary>
          public void ParseHtml()
          {
              HtmlDocument doc = new HtmlDocument();
              doc.LoadHtml(htmlCode);

              // Create an XmlWriterSettings object with the correct options.
              XmlWriterSettings settings = new XmlWriterSettings();
              settings.Indent = true;
              settings.IndentChars = (" ");
              settings.OmitXmlDeclaration = false;

              // 'using' closes the writer even when parsing throws.
              using (XmlWriter writer = XmlWriter.Create(@"..\..\" + fileName, settings))
              {
                  writer.WriteStartElement("Stats");
                  writer.WriteAttributeString("Date", DateTime.Now.ToShortDateString());

                  // Iterate all rows within another row. SelectNodes returns null
                  // (not an empty collection) when nothing matches.
                  HtmlNodeCollection rows = doc.DocumentNode.SelectNodes(".//tr/tr");
                  if (rows != null)
                  {
                      foreach (HtmlNode row in rows)
                      {
                          HtmlNodeCollection cols = row.SelectNodes(".//td[@class='statBox']");
                          if (cols == null)
                          {
                              // Row without stat cells: nothing to write.
                              continue;
                          }

                          WritePlayer(writer, cols);
                      }
                  }

                  writer.WriteEndElement();
                  writer.Flush();
              }
          }

          /// <summary>
          /// Writes one <c>Player</c> element: the first cell becomes the <c>Rank</c>
          /// attribute, the following cells become child elements named after
          /// <c>StatElementNames</c>. Cells beyond the known names are ignored.
          /// </summary>
          /// <param name="writer">Open writer positioned inside the <c>Stats</c> element.</param>
          /// <param name="cols">Non-empty collection of stat cells for one row.</param>
          private static void WritePlayer(XmlWriter writer, HtmlNodeCollection cols)
          {
              writer.WriteStartElement("Player");
              writer.WriteAttributeString("Rank", cols[0].InnerText.Trim());

              // Never index past the cells actually present in this row.
              int statCount = Math.Min(cols.Count - 1, StatElementNames.Length);
              for (int k = 0; k < statCount; ++k)
              {
                  writer.WriteElementString(StatElementNames[k], cols[k + 1].InnerText.Trim());
              }

              writer.WriteEndElement();
          }
      }
  }

它提供以下XML文件作为输出:

    Sidney Crosby PIT C 39 32 33 20 29 10 1 3 0 154 20.8 21:54 22.6   

我在评论中的意思是:你在代码中(用嵌套循环)所做的事情,用合适的XPath就可以替你完成。 使用LINQ-to-XML可以让写入XML变得更加简单。 既然现在我们已经看到了您希望的XML文件格式,就可以给出自己的答案了。 我会像这样编写ParseHtml()方法:

 /// <summary>
 /// Parses the stat cells out of <c>htmlCode</c> and saves them as an XML document
 /// with one <c>Player</c> element per table row that contains stat cells.
 /// </summary>
 public void ParseHtml()
 {
     var htmlDoc = new HtmlDocument();
     htmlDoc.LoadHtml(htmlCode);

     var elementNames = new[]
     {
         "Name", "Team", "Pos", "GP", "G", "A", "PlusMinus", "PIM", "PP", "SH",
         "GW", "OT", "Shots", "ShotPctg", "TOIPerGame", "ShiftsPerGame", "FOWinPctg",
         "UnknownField"
     };

     // use the right XPath rather than looping manually:
     // only nested rows that actually contain stat cells are selected
     var rows = htmlDoc.DocumentNode.SelectNodes(@"//tr/tr[td[@class='statBox']]");

     // SelectNodes returns null (not an empty collection) when nothing matches
     var players = rows == null
         ? Enumerable.Empty<XElement>()
         : rows.Select(row =>
         {
             // the row predicate above guarantees at least one matching cell
             var cells = row.SelectNodes(@"./td[@class='statBox']")
                 .Select(node => node.InnerText.Trim())
                 .ToList();

             return new XElement("Player",
                 new XAttribute("Rank", cells[0]),
                 // generate the elements based on the parsed cells
                 cells.Skip(1)
                      .Zip(elementNames, (value, name) => new XElement(name, value))
                      .Where(element => !String.IsNullOrEmpty(element.Value)));
         });

     var xmlDoc = new XElement("Stats",
         new XAttribute("Date", DateTime.Now.ToShortDateString()),
         players);

     // save to your file
     // NOTE(review): 'filepath' is not defined in this snippet; presumably supplied by the enclosing class - confirm.
     xmlDoc.Save(filepath);
 }

产生输出:

    Sidney Crosby PIT C 39 32 33 20 29 10 1 3 0 154 20.8 21:54 22.6 55.7