更好的方法来检测XML？

目前，我有以下c＃代码从文本中提取值。如果是XML，我想要它里面的值 – 否则，如果它不是XML，它只能返回文本本身。

// Return the text content of the root element when `data` parses as XML;
// otherwise return `data` unchanged.
String data = "...";
try
{
    return XElement.Parse(data).Value;
}
catch (System.Xml.XmlException)
{
    // Not well-formed XML: fall back to the raw text.
    return data;
}

我知道C＃中的exception是昂贵的，所以我想知道是否有更好的方法来确定我正在处理的文本是否是xml？

我想到了正则表达式测试，但我不认为这是一个更便宜的替代方案。注意，我要求一种更便宜的方法。

你可以先初步检查文本是否以 < 开头：所有 XML 都必须以 < 开头，而绝大多数非 XML 文本都不会以 < 开头。

（以下代码为随手写就，未经编译测试。）

 // Cheap pre-check before paying for a parse (and a possible exception):
 // only text whose first non-whitespace character is '<' can be XML.
 if (string.IsNullOrEmpty(data))
 {
     // Has to have length to be XML.
     return data;
 }

 var trimmedData = data.TrimStart();

 // Covers both the all-whitespace case (nothing left after trimming) and
 // text that plainly does not start with a tag. Every path now returns.
 if (trimmedData.Length == 0 || trimmedData[0] != '<')
 {
     return data;
 }

 try
 {
     // It starts with '<', so it probably is XML; the parser has the final say.
     return XElement.Parse(data).Value;
 }
 catch (System.Xml.XmlException)
 {
     return data;
 }

我最初使用了正则表达式，但Trim（）[0]与正则表达式相同。

下面给出的代码将匹配以下所有xml格式：

    xml data1 data2"); data3  data4 data5

这是代码：

 /// <summary>
 /// Regex-based recogniser for a single XML element on one line, e.g.
 /// <c>&lt;text&gt;data&lt;/text&gt;</c>, <c>&lt;text attr='2'&gt;data&lt;/text&gt;</c>
 /// or <c>&lt;text /&gt;</c>.
 /// </summary>
 public class XmlExpresssion
 {
     // EXPLANATION OF EXPRESSION
     //   <        : \<{1}
     //   text     : (?<xmlTag>\w+)  - named group so the start and end tags must match
     //   (\w|\W)* : matches attributes, if any
     //   >        : \>{1}
     //   xml data : (?<data>.*)     - named group holding the element data that is returned
     //   </       : \<{1}/{1}
     //   text     : \k<xmlTag>      - back-reference to the opening tag name
     //   >        : \>{1}
     // The second alternative, (\s*/{1}), accepts the self-closing form <text />.
     //
     // How the pattern was built up incrementally:
     //   <text>data</text>              @"^\<{1}(?<xmlTag>\w+)\>{1}.*\<{1}/{1}\k<xmlTag>\>{1}$"
     //   <text />                       @"^\<{1}(?<xmlTag>\w+)\s*/{1}\>{1}$"
     //   either of the above            @"^\<{1}(?<xmlTag>\w+)((\>{1}.*\<{1}/{1}\k<xmlTag>)|(\s*/{1}))\>{1}$"
     //   ... plus attributes and data   (the final pattern below)
     private const string XML_PATTERN =
         @"^\<{1}(?<xmlTag>\w+)(((\w|\W)*\>{1}(?<data>.*)\<{1}/{1}\k<xmlTag>)|(\s*/{1}))\>{1}$";

     // Built once: constructing a compiled Regex on every call is far more
     // expensive than the match itself.
     private static readonly Regex s_regex = new Regex(XML_PATTERN, RegexOptions.Compiled);

     /// <summary>
     /// Assigns the element value to <paramref name="result"/> if the string is xml.
     /// </summary>
     /// <param name="s">Candidate text; may be null.</param>
     /// <param name="result">
     /// The element data on success (empty for a self-closing element); null otherwise.
     /// </param>
     /// <returns>true if success, false otherwise</returns>
     public static bool TryParse(string s, out string result)
     {
         // A Try* method should not throw on null; treat it as "not XML".
         // A single Match call both tests the input and captures the data.
         Match match = s == null ? Match.Empty : s_regex.Match(s);
         if (match.Success)
         {
             result = match.Groups["data"].Value;
             return true;
         }

         result = null;
         return false;
     }
 }

调用代码：

 // Use the extracted element data when the input is recognised as XML,
 // and the raw input otherwise.
 bool recognised = XmlExpresssion.TryParse(s, out result);
 if (!recognised)
 {
     result = s;
 }

 Console.WriteLine(result);

更新：（原帖在下面）科林有一个绝妙的想法：把正则表达式的实例化移到调用之外，这样它们只会被创建一次。以下是新程序：

 using System;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
 using System.Xml.Linq;
 using System.Diagnostics;
 using System.Text.RegularExpressions;

 namespace ConsoleApplication3
 {
     /// <summary>Shape of every candidate "extract the value if it is XML" function.</summary>
     delegate String xmltestFunc(String data);

     /// <summary>
     /// Benchmark of five ways to return an element's text when the input is XML
     /// and the input itself otherwise. In this version every Regex is created once.
     /// </summary>
     class Program
     {
         static readonly int iterations = 1000000;

         /// <summary>
         /// Checks that <paramref name="func"/> yields <paramref name="expectedResult"/> for
         /// <paramref name="data"/>, then times <c>iterations</c> calls and prints the seconds taken.
         /// Prints "fail" and skips the timing when the result is wrong.
         /// </summary>
         private static void benchmark(xmltestFunc func, String data, String expectedResult)
         {
             if (!func(data).Equals(expectedResult))
             {
                 Console.WriteLine(data + ": fail");
                 return;
             }

             Stopwatch sw = Stopwatch.StartNew();
             for (int i = 0; i < iterations; ++i)
             {
                 func(data);
             }
             sw.Stop();

             Console.WriteLine(data + ": " + (sw.ElapsedMilliseconds / 1000f));
         }

         static void Main(string[] args)
         {
             // Each candidate is run against: a bare element, the same element
             // surrounded by whitespace, and plain non-XML text.
             benchmark(xmltest1, "<tag>base</tag>", "base");
             benchmark(xmltest1, " <tag>base</tag> ", "base");
             benchmark(xmltest1, "base", "base");

             benchmark(xmltest2, "<tag>ColinBurnett</tag>", "ColinBurnett");
             benchmark(xmltest2, " <tag>ColinBurnett</tag> ", "ColinBurnett");
             benchmark(xmltest2, "ColinBurnett", "ColinBurnett");

             benchmark(xmltest3, "<tag>Si</tag>", "Si");
             benchmark(xmltest3, " <tag>Si</tag> ", "Si");
             benchmark(xmltest3, "Si", "Si");

             benchmark(xmltest4, "<tag>RashmiPandit</tag>", "RashmiPandit");
             benchmark(xmltest4, " <tag>RashmiPandit</tag> ", "RashmiPandit");
             benchmark(xmltest4, "RashmiPandit", "RashmiPandit");

             benchmark(xmltest5, "<tag>Custom</tag>", "Custom");
             benchmark(xmltest5, " <tag>Custom</tag> ", "Custom");
             benchmark(xmltest5, "Custom", "Custom");

             // "press any key to continue"
             Console.WriteLine("Done.");
             Console.ReadLine();
         }

         /// <summary>Baseline: always parse, and rely on the exception for non-XML input.</summary>
         public static String xmltest1(String data)
         {
             try
             {
                 return XElement.Parse(data).Value;
             }
             catch (System.Xml.XmlException)
             {
                 return data;
             }
         }

         static readonly Regex xmltest2regex = new Regex("^[ \t\r\n]*<");

         /// <summary>Parse only when the text starts with '&lt;', optionally after whitespace.</summary>
         public static String xmltest2(String data)
         {
             // Has to have length to be XML
             if (!string.IsNullOrEmpty(data))
             {
                 // If it starts with a < then it probably is XML
                 // But also cover the case where there is indeterminate whitespace before the <
                 if (data[0] == '<' || xmltest2regex.Match(data).Success)
                 {
                     try
                     {
                         return XElement.Parse(data).Value;
                     }
                     catch (System.Xml.XmlException)
                     {
                         return data;
                     }
                 }
             }

             return data;
         }

         // <tag>text</tag>, where the closing tag must repeat the opening tag name.
         static readonly Regex xmltest3regex = new Regex(@"<(?<tag>\w*)>(?<text>.*)</\k<tag>>");

         /// <summary>Match and extract in one step with a simple regex; no XML parser involved.</summary>
         public static String xmltest3(String data)
         {
             Match m = xmltest3regex.Match(data);
             if (m.Success)
             {
                 GroupCollection gc = m.Groups;
                 if (gc.Count > 0)
                 {
                     return gc["text"].Value;
                 }
             }

             return data;
         }

         /// <summary>Delegates to <see cref="XmlExpresssion.TryParse"/>.</summary>
         public static String xmltest4(String data)
         {
             String result;
             if (!XmlExpresssion.TryParse(data, out result))
             {
                 result = data;
             }

             return result;
         }

         static readonly Regex xmltest5regex = new Regex("^[ \t\r\n]*<");

         /// <summary>
         /// Like <see cref="xmltest2"/>, but tries a trim-based check before the regex.
         /// </summary>
         public static String xmltest5(String data)
         {
             // Has to have length to be XML
             if (!string.IsNullOrEmpty(data))
             {
                 // If it starts with a < then it probably is XML
                 // But also cover the case where there is indeterminate whitespace before the <.
                 // TrimStart + StartsWith is used instead of Trim()[0], which throws
                 // IndexOutOfRangeException when the text is all whitespace.
                 if (data[0] == '<'
                     || data.TrimStart().StartsWith("<", StringComparison.Ordinal)
                     || xmltest5regex.Match(data).Success)
                 {
                     try
                     {
                         return XElement.Parse(data).Value;
                     }
                     catch (System.Xml.XmlException)
                     {
                         return data;
                     }
                 }
             }

             return data;
         }
     }

     /// <summary>
     /// Regex-based recogniser for a single XML element on one line; the Regex is
     /// compiled once and shared by all calls.
     /// </summary>
     public class XmlExpresssion
     {
         // EXPLANATION OF EXPRESSION
         //   <        : \<{1}
         //   text     : (?<xmlTag>\w+)  - named group so the start and end tags must match
         //   (\w|\W)* : matches attributes, if any
         //   >        : \>{1}
         //   xml data : (?<data>.*)     - named group holding the element data that is returned
         //   </       : \<{1}/{1}
         //   text     : \k<xmlTag>      - back-reference to the opening tag name
         //   >        : \>{1}
         // The second alternative, (\s*/{1}), accepts the self-closing form <text />.
         //
         // How the pattern was built up incrementally:
         //   <text>data</text>              @"^\<{1}(?<xmlTag>\w+)\>{1}.*\<{1}/{1}\k<xmlTag>\>{1}$"
         //   <text />                       @"^\<{1}(?<xmlTag>\w+)\s*/{1}\>{1}$"
         //   either of the above            @"^\<{1}(?<xmlTag>\w+)((\>{1}.*\<{1}/{1}\k<xmlTag>)|(\s*/{1}))\>{1}$"
         //   ... plus attributes and data   (the final pattern below)
         // Declared before `regex` on purpose: static initialisers run in textual order.
         private static readonly string XML_PATTERN =
             @"^\<{1}(?<xmlTag>\w+)(((\w|\W)*\>{1}(?<data>.*)\<{1}/{1}\k<xmlTag>)|(\s*/{1}))\>{1}$";

         private static readonly Regex regex = new Regex(XML_PATTERN, RegexOptions.Compiled);

         // Checks if the string is in xml format
         private static bool IsXml(string value)
         {
             return regex.IsMatch(value);
         }

         /// <summary>
         /// Assigns the element value to result if the string is xml
         /// </summary>
         /// <returns>true if success, false otherwise</returns>
         public static bool TryParse(string s, out string result)
         {
             // The test-then-match structure is kept as benchmarked.
             if (XmlExpresssion.IsXml(s))
             {
                 result = regex.Match(s).Result("${data}");
                 return true;
             }
             else
             {
                 result = null;
                 return false;
             }
         }
     }
 }

以下是新结果：

 base: 3.667 base : 3.707 base: 40.737 ColinBurnett: 3.707 ColinBurnett : 4.784 ColinBurnett: 0.413 Si: 2.016 Si : 2.141 Si: 0.087 RashmiPandit: 12.305 RashmiPandit : fail RashmiPandit: 0.131 Custom: 3.761 Custom : 3.866 Custom: 0.329 Done.

你有它。预编译的正则表达式是可行的方法，并且非常有效。

（原帖）

我拼凑了以下程序，对本问题各个回答中提供的代码示例进行基准测试，以说明我这篇帖子的推理，并评估各个回答的速度。

闲话少说，程序如下。

 using System;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
 using System.Xml.Linq;
 using System.Diagnostics;
 using System.Text.RegularExpressions;

 namespace ConsoleApplication3
 {
     /// <summary>Shape of every candidate "extract the value if it is XML" function.</summary>
     delegate String xmltestFunc(String data);

     /// <summary>
     /// Original benchmark of four ways to return an element's text when the input is XML
     /// and the input itself otherwise. The Regex objects are deliberately created inside
     /// the measured calls: that per-call cost is what this version measures.
     /// </summary>
     class Program
     {
         static readonly int iterations = 1000000;

         /// <summary>
         /// Checks that <paramref name="func"/> yields <paramref name="expectedResult"/> for
         /// <paramref name="data"/>, then times <c>iterations</c> calls and prints the seconds taken.
         /// Prints "fail" and skips the timing when the result is wrong.
         /// </summary>
         private static void benchmark(xmltestFunc func, String data, String expectedResult)
         {
             if (!func(data).Equals(expectedResult))
             {
                 Console.WriteLine(data + ": fail");
                 return;
             }

             Stopwatch sw = Stopwatch.StartNew();
             for (int i = 0; i < iterations; ++i)
             {
                 func(data);
             }
             sw.Stop();

             Console.WriteLine(data + ": " + (sw.ElapsedMilliseconds / 1000f));
         }

         static void Main(string[] args)
         {
             // Each candidate is run against: a bare element, the same element
             // surrounded by whitespace, and plain non-XML text.
             benchmark(xmltest1, "<tag>base</tag>", "base");
             benchmark(xmltest1, " <tag>base</tag> ", "base");
             benchmark(xmltest1, "base", "base");

             benchmark(xmltest2, "<tag>ColinBurnett</tag>", "ColinBurnett");
             benchmark(xmltest2, " <tag>ColinBurnett</tag> ", "ColinBurnett");
             benchmark(xmltest2, "ColinBurnett", "ColinBurnett");

             benchmark(xmltest3, "<tag>Si</tag>", "Si");
             benchmark(xmltest3, " <tag>Si</tag> ", "Si");
             benchmark(xmltest3, "Si", "Si");

             benchmark(xmltest4, "<tag>RashmiPandit</tag>", "RashmiPandit");
             benchmark(xmltest4, " <tag>RashmiPandit</tag> ", "RashmiPandit");
             benchmark(xmltest4, "RashmiPandit", "RashmiPandit");

             // "press any key to continue"
             Console.WriteLine("Done.");
             Console.ReadLine();
         }

         /// <summary>Baseline: always parse, and rely on the exception for non-XML input.</summary>
         public static String xmltest1(String data)
         {
             try
             {
                 return XElement.Parse(data).Value;
             }
             catch (System.Xml.XmlException)
             {
                 return data;
             }
         }

         /// <summary>Parse only when the text starts with '&lt;', optionally after whitespace.</summary>
         public static String xmltest2(String data)
         {
             // Has to have length to be XML
             if (!string.IsNullOrEmpty(data))
             {
                 // If it starts with a < then it probably is XML
                 // But also cover the case where there is indeterminate whitespace before the <
                 // (the Regex is intentionally built per call in this baseline).
                 if (data[0] == '<' || new Regex("^[ \t\r\n]*<").Match(data).Success)
                 {
                     try
                     {
                         return XElement.Parse(data).Value;
                     }
                     catch (System.Xml.XmlException)
                     {
                         return data;
                     }
                 }
             }

             return data;
         }

         /// <summary>Match and extract in one step with a simple regex; no XML parser involved.</summary>
         public static String xmltest3(String data)
         {
             // <tag>text</tag>, where the closing tag must repeat the opening tag name.
             // Intentionally built per call in this baseline.
             Regex regex = new Regex(@"<(?<tag>\w*)>(?<text>.*)</\k<tag>>");
             Match m = regex.Match(data);
             if (m.Success)
             {
                 GroupCollection gc = m.Groups;
                 if (gc.Count > 0)
                 {
                     return gc["text"].Value;
                 }
             }

             return data;
         }

         /// <summary>Delegates to <see cref="XmlExpresssion.TryParse"/>.</summary>
         public static String xmltest4(String data)
         {
             String result;
             if (!XmlExpresssion.TryParse(data, out result))
             {
                 result = data;
             }

             return result;
         }
     }

     /// <summary>
     /// Regex-based recogniser for a single XML element on one line. This baseline
     /// builds a compiled Regex on every successful TryParse call.
     /// </summary>
     public class XmlExpresssion
     {
         // EXPLANATION OF EXPRESSION
         //   <        : \<{1}
         //   text     : (?<xmlTag>\w+)  - named group so the start and end tags must match
         //   (\w|\W)* : matches attributes, if any
         //   >        : \>{1}
         //   xml data : (?<data>.*)     - named group holding the element data that is returned
         //   </       : \<{1}/{1}
         //   text     : \k<xmlTag>      - back-reference to the opening tag name
         //   >        : \>{1}
         // The second alternative, (\s*/{1}), accepts the self-closing form <text />.
         //
         // How the pattern was built up incrementally:
         //   <text>data</text>              @"^\<{1}(?<xmlTag>\w+)\>{1}.*\<{1}/{1}\k<xmlTag>\>{1}$"
         //   <text />                       @"^\<{1}(?<xmlTag>\w+)\s*/{1}\>{1}$"
         //   either of the above            @"^\<{1}(?<xmlTag>\w+)((\>{1}.*\<{1}/{1}\k<xmlTag>)|(\s*/{1}))\>{1}$"
         //   ... plus attributes and data   (the final pattern below)
         private const string XML_PATTERN =
             @"^\<{1}(?<xmlTag>\w+)(((\w|\W)*\>{1}(?<data>.*)\<{1}/{1}\k<xmlTag>)|(\s*/{1}))\>{1}$";

         // Checks if the string is in xml format
         private static bool IsXml(string value)
         {
             return Regex.IsMatch(value, XML_PATTERN);
         }

         /// <summary>
         /// Assigns the element value to result if the string is xml
         /// </summary>
         /// <returns>true if success, false otherwise</returns>
         public static bool TryParse(string s, out string result)
         {
             if (XmlExpresssion.IsXml(s))
             {
                 // Compiling a new Regex here on each call is the cost this baseline measures.
                 Regex r = new Regex(XML_PATTERN, RegexOptions.Compiled);
                 result = r.Match(s).Result("${data}");
                 return true;
             }
             else
             {
                 result = null;
                 return false;
             }
         }
     }
 }

以下是结果。每个人都执行了100万次。

 base: 3.531 base : 3.624 base: 41.422 ColinBurnett: 3.622 ColinBurnett : 16.467 ColinBurnett: 7.995 Si: 19.014 Si : 19.201 Si: 15.567

测试4耗时太长：运行30分钟后仍未完成，因此被判定为太慢。为了证明它有多慢，下面是同样的测试只运行1000次的结果。

 base: 0.004 base : 0.004 base: 0.047 ColinBurnett: 0.003 ColinBurnett : 0.016 ColinBurnett: 0.008 Si: 0.021 Si : 0.017 Si: 0.014 RashmiPandit: 3.456 RashmiPandit : fail RashmiPandit: 0 Done.

据此推算，执行一百万次需要3456秒，也就是57分钟多一点。

这是一个很好的例子，说明了如果你追求高效的代码，为什么复杂的正则表达式是个坏主意。不过它也表明，在某些情况下简单的正则表达式仍然是不错的答案——例如 ColinBurnett 回答中对 XML 的小型“预检测”：它可能让基本情况变得更昂贵（在测试2中，正则表达式是在每次调用时创建的），但通过避免异常，也让非 XML 的情况快了很多。

我发现处理你的情况是一种完全可以接受的方式（这可能也是我处理它的方式）。我在MSDN中找不到任何类型的“XElement.TryParse（string）”，所以你拥有它的方式就可以了。

除了执行 XElement.Parse 这样的操作之外，没有办法验证文本是否是 XML。例如，如果文本中缺少最后一个右尖括号，它就不是有效的 XML，而你不太可能通过 RegEx 或文本解析发现这一点。RegEx 解析很可能会漏掉许多非法字符、非法序列等问题。

你所能做的就是缩短你的失败案例。

因此，如果您希望看到大量的非XML数据，并且XML的预期情况较少，则使用RegEx或子字符串搜索来检测尖括号可能会节省您一点时间，但我建议这仅在你是在紧密的循环中批量处理大量数据。

相反，如果这是解析用户从 Web 表单或 WinForms 应用程序输入的数据，那么我认为承担 Exception 的开销，可能比投入开发和测试工作来确保你的捷径代码不会产生误报/漏报更划算。

目前尚不清楚从哪里获取XML（文件，流，文本框或其他地方），但请记住，空格，注释，字节顺序标记和其他内容可能会妨碍简单的规则，例如“它必须以< ”。

为什么正则表达式很贵？它不是可以一石二鸟（同时完成匹配和解析）吗？

解析所有元素的简单示例，如果它只是一个元素，则更容易！

 // <tag>text</tag>: the "tag" group captures the element name, the "text" group its
 // content, and \k<tag> forces the closing tag to repeat the opening tag name.
 Regex regex = new Regex(@"<(?<tag>\w*)>(?<text>.*)</\k<tag>>");
 MatchCollection matches = regex.Matches(data);
 foreach (Match match in matches)
 {
     GroupCollection groups = match.Groups;
     string name = groups["tag"].Value;
     string value = groups["text"].Value;
     ...
 }

线索 – 所有有效的 XML 都必须以 "<?xml" 开头


 您可能必须处理字符集差异，但检查纯ASCII，utf-8和unicode将覆盖99.5％的xml。



		      	 如果你在循环中使用它，而其中大多数 XML 都是无效的，那么你建议的方式开销会很大；如果 XML 有效，你的代码运行起来就像没有异常处理一样……所以，如果在大多数情况下你的 XML 是有效的，或者你没有在循环中使用它，你的代码就能很好地工作。 



		      	 如果你想知道它是否有效，为什么不使用内置的.NetFX对象而不是从头开始写一个？ 
 希望这可以帮助， 
 法案 



		      	  Colin Burnett技术的一个变种：你可以在开头做一个简单的正则表达式来查看文本是否以标签开头，然后尝试解析它。 可能> 99％的字符串，你将以有效元素开头处理的是XML。 这样，您可以跳过完整有效XML的正则表达式处理，并且几乎在所有情况下都跳过基于exception的处理。 
 像^<[^>]+>这样的东西可能会成功。 



		      	 我不确定你的要求是否考虑了文件格式，并且这个问题被问了很久以后我碰巧搜索了类似的东西，我想让你知道什么对我有用，所以如果有人来这里可能有帮助 ：） 
 我们可以使用Path.GetExtension（filePath）并检查它是否是XML然后以其他方式使用它做任何需要的事情 



		      	 正如 @JustEngland 在评论中指出的那样，异常并不昂贵；调试器拦截它们可能需要时间，但通常情况下它们性能良好，而且是良好的做法。请参阅“C# 中的异常有多昂贵？”。 
 更好的方法是滚动自己的TryParse样式函数： 
 [System.Diagnostics.DebuggerNonUserCode]
 static class MyXElement
 {
     /// <summary>
     /// Attempts to parse <paramref name="data"/> as an XML element without letting
     /// <see cref="System.Xml.XmlException"/> escape to the caller.
     /// </summary>
     /// <param name="data">Text to parse.</param>
     /// <param name="result">The parsed element on success; null on failure.</param>
     /// <returns>true when parsing succeeded, false when the text is not well-formed XML.</returns>
     public static bool TryParse(string data, out XElement result)
     {
         XElement parsed;
         try
         {
             parsed = XElement.Parse(data);
         }
         catch (System.Xml.XmlException)
         {
             result = null;
             return false;
         }

         result = parsed;
         return true;
     }
 }
  DebuggerNonUserCode属性使调试器跳过捕获的exception以简化您的调试体验。 
 像这样使用： 
  /// <summary>
  /// Demo: builds an Address element from a '~'-separated list, using the parsed
  /// markup when it is well-formed XML and the raw text otherwise.
  /// </summary>
  static void Main()
  {
      var addressList = "line one~line two~line three~postcode";
      var address = new XElement("Address");

      // NOTE(review): the markup inside these literals was stripped from the listing
      // (the separator had been rendered as a line break, splitting the literal across
      // two lines). <p> / <br /> are a reconstruction - confirm against the original.
      var addressHtml = "<p>" + addressList.Replace("~", "<br />") + "</p>";

      XElement content;
      if (MyXElement.TryParse(addressHtml, out content))
      {
          address.ReplaceAll(content);
      }
      else
      {
          address.SetValue(addressHtml);
      }

      Console.WriteLine(address.ToString());
      Console.ReadKey();
  }
  }
 我宁愿为TryParse创建一个扩展方法，但是你不能创建一个在类型而不是实例上调用的静态方法。 



		      	 怎么样，把你的字符串或对象扔进一个新的XDocument或XElement。 一切都使用ToString（）解析。



  用于WPF DataGrid的ScrollIntoView（MVVM）
  在WPF窗口中挂钩进入Windows消息循环，在内部添加白色边框
	以编程方式检查Gac中是否存在dll。如果将其显示在网格中
为什么类型约束不是方法签名的一部分？
在C＃中使用RTMP或RTSP协议
如何通过脚本使Texture2D可读
递归调用buff / unbuff？  C＃Unity3D
从数据库中检索mp3文件
如何基于多个输入过滤DataView
在.NET中有效地重定向标准输出
如何在j＃dot中将json字符串反序列化为对象列表

更好的方法来检测XML？

为什么静态构造函数没有任何参数

右键单击以选择ListBox中的项目

将linq连接的左侧或右侧填充为相同的行数

你能在Bitmap图像中将一种颜色更改为另一种颜色吗？

动态地在每行之后重复标题行

如何编译C＃字符串插值？

如何快速将数据库中的数据导入对象列表（使用entity framework）

如何使用Pkcs11Interop与NitroKey HSM来获取与EC的共享密钥

MVC 2 RC 2中的IValueProvider

在LINQ中获取结果函数而不转换为存储表达式