在RTF中提取嵌入式图像对象

我有一些包含嵌入对象(图像)的RTF文档,需要把这些对象提取为Image对象(或任何其他可用的格式)。 我已经看过这篇CodeProject文章,但它的示例程序无法正确渲染这些对象(显示的是“默认图像”占位图,而不是图像本身),所以我只好另寻办法。

这是该RTF文档在写字板中显示的截图:

以下是RTF代码的示例(由于大小,我不得不缩短它):

 {\rtf1\ansi\deff0{\fonttbl{\f0\fnil\fcharset0 MS Sans Serif;}} \viewkind4\uc1\pard\lang1033\f0\fs18{\object\objemb{\*\objclass Package}\objw855\objh810{\*\objdata 01050000 02000000 08000000 5061636b61676500 00000000 00000000 1f900000 02007369675f5f2e6a706700433a5c55736572735c726563657074696f6e5c4465736b746f705c 5369676e6174757265735c7369675f5f2e6a7067000000030034000000433a5c55736572735c52 45434550547e315c417070446174615c4c6f63616c5c54656d705c7369675f5f20283132292e6a 706700c18e0000ffd8ffe000104a46494600010101004800470000ffdb00430001010101010101 010101010101010101010101010101010101010101010101010101010101010101010101010101 010101010101010101010101010101010101ffdb00430101010101010101010101010101010101 010101010101010101010101010101010101010101010101010101010101010101010101010101 010101010101010101ffc0001108012c03e803012200021101031101ffc4001f00010002030002 0301000000000000000000090a07080b050602030401ffc4003f10000006030001040201030301 04070900000203040506010708090a11121314152116172223314118192532591a24576598d6d8 2933384651788497b7ffc4001a010101000301010000000000000000000000030204050106ffc4 002b11010003010100020103030402030000000002030401051112130614211522230731415124 32536162ffda000c03010002110311003f00bfc000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 ... 
005c0072006500630065007000740069006f006e005c004400650073006b0074006f0070005c00 5300690067006e006100740075007200650073005c007300690067005f005f002e006a00700067 00 01050000 00000000 }{\result{\pict\wmetafile8\picw2010\pich1905\picwgoal855\pichgoal810 0100090000033b0700000200210600000000050000000b0200000000050000000c02350038001c 000000fb02f4ff000000000000900100000001000000005365676f65205549000e0a52104c2308 00dd1900d894ef758001f3758d0e664a040000002d010000050000000902000000000500000001 02ffffff00a5000000410bc600880020002000000000002000200000000c002800000020000000 400000000100010000000000000100000000000000000000000000000000000000000000ffffff ... 0021001c001c000000fb021000070000000000bc02000000000102022253797374656d00008d0e 664a00000a0022008a0100000000ffffffff8cdd1900040000002d010100030000000000 }}}\par } 

这是一段代码,可以从RTF流中提取所有对象(’Package’类对象):

  /// <summary>
  /// Scans an RTF file for embedded objects whose class is "Package" and writes the
  /// payload of each one to a file named after the package's display name, in the
  /// current working directory.
  /// </summary>
  /// <param name="filePath">Path of the RTF file to read.</param>
  /// <exception cref="NotSupportedException">
  /// Propagated from <c>RtfReader.ExtractObjectData</c> or <c>PackagedObject.Extract</c>
  /// when an object is not an embedded object or the package is not a file.
  /// </exception>
  public static void ExtractPackageObjects(string filePath)
  {
      using (StreamReader sr = new StreamReader(filePath))
      using (IEnumerator<RtfObject> enumerator = new RtfReader(sr).Read().GetEnumerator())
      {
          while (enumerator.MoveNext())
          {
              if (enumerator.Current.Text != "object")
                  continue;

              // The helpers below advance the same enumerator, so the outer loop
              // resumes right after whatever they consumed.
              if (!RtfReader.MoveToNextControlWord(enumerator, "objclass"))
                  continue;

              string className = RtfReader.GetNextText(enumerator);
              if (className != "Package")
                  continue;

              if (!RtfReader.MoveToNextControlWord(enumerator, "objdata"))
                  continue;

              byte[] data = RtfReader.GetNextTextAsByteArray(enumerator);
              if (data == null)
                  continue; // input ended before any object data was found

              using (MemoryStream objectData = new MemoryStream(data))
              using (MemoryStream packageData = new MemoryStream())
              {
                  RtfReader.ExtractObjectData(objectData, packageData);
                  packageData.Position = 0;

                  PackagedObject po = PackagedObject.Extract(packageData);

                  // The display name comes from the document, so it is untrusted:
                  // keep only the file-name part so a crafted package cannot write
                  // outside the current directory.
                  string fileName = Path.GetFileName(po.DisplayName);
                  if (string.IsNullOrEmpty(fileName))
                      continue;

                  File.WriteAllBytes(fileName, po.Data);
              }
          }
      }
  }

以下是此代码使用的实用程序类。 有一个简单的基于流的RTF解析器,允许访问有趣的控制字。

还有一个实用类,用于从序列化的Object Packager实例中提取数据。 Object Packager是将近20年前的OLE 1.0遗留组件,其序列化的二进制格式(据我所知)没有公开文档,但可以通过分析弄清楚。

这段代码适用于您提供的示例,但您可能需要根据实际情况做一些调整。

 /// <summary>
 /// Simple forward-only, stream-based RTF tokenizer. It yields the control words
 /// and text runs it encounters, plus static helpers to navigate the token stream
 /// and to decode OLE 1.0 embedded object data.
 /// </summary>
 public class RtfReader
 {
     /// <summary>Creates a tokenizer over the given reader.</summary>
     /// <param name="reader">Source of RTF characters; must not be null.</param>
     /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
     public RtfReader(TextReader reader)
     {
         if (reader == null)
             throw new ArgumentNullException("reader");

         Reader = reader;
     }

     /// <summary>The underlying character source.</summary>
     public TextReader Reader { get; private set; }

     /// <summary>
     /// Lazily tokenizes the input into <see cref="RtfControlWord"/> and
     /// <see cref="RtfText"/> items. Carriage returns and line feeds are ignored.
     /// Tokens whose content is empty or whitespace-only are not emitted.
     /// </summary>
     public IEnumerable<RtfObject> Read()
     {
         StringBuilder controlWord = new StringBuilder();
         StringBuilder text = new StringBuilder();
         Stack<RtfParseState> stack = new Stack<RtfParseState>();
         RtfParseState state = RtfParseState.Group;

         do
         {
             int i = Reader.Read();
             if (i < 0)
             {
                 // end of input: flush whatever is still pending
                 if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
                     yield return new RtfControlWord(controlWord.ToString());

                 if (!string.IsNullOrWhiteSpace(text.ToString()))
                     yield return new RtfText(text.ToString());

                 yield break;
             }

             char c = (char)i;

             // noise chars
             if ((c == '\r') || (c == '\n'))
                 continue;

             switch (state)
             {
                 case RtfParseState.Group:
                     if (c == '{')
                     {
                         stack.Push(state);
                         break;
                     }

                     if (c == '\\')
                     {
                         state = RtfParseState.ControlWord;
                         break;
                     }
                     break;

                 case RtfParseState.ControlWord:
                     if (c == '\\')
                     {
                         // another controlWord
                         if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
                         {
                             yield return new RtfControlWord(controlWord.ToString());
                             controlWord.Clear();
                         }
                         break;
                     }

                     if (c == '{')
                     {
                         // a new group
                         state = RtfParseState.Group;
                         if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
                         {
                             yield return new RtfControlWord(controlWord.ToString());
                             controlWord.Clear();
                         }
                         break;
                     }

                     if (c == '}')
                     {
                         // close group
                         state = stack.Count > 0 ? stack.Pop() : RtfParseState.Group;
                         if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
                         {
                             yield return new RtfControlWord(controlWord.ToString());
                             controlWord.Clear();
                         }
                         break;
                     }

                     if (!Char.IsLetterOrDigit(c))
                     {
                         // the delimiter ends the control word and starts a text run
                         state = RtfParseState.Text;
                         text.Append(c);
                         if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
                         {
                             yield return new RtfControlWord(controlWord.ToString());
                             controlWord.Clear();
                         }
                         break;
                     }

                     controlWord.Append(c);
                     break;

                 case RtfParseState.Text:
                     if (c == '\\')
                     {
                         state = RtfParseState.EscapedText;
                         break;
                     }

                     if (c == '{')
                     {
                         if (!string.IsNullOrWhiteSpace(text.ToString()))
                         {
                             yield return new RtfText(text.ToString());
                             text.Clear();
                         }

                         // a new group
                         state = RtfParseState.Group;
                         break;
                     }

                     if (c == '}')
                     {
                         if (!string.IsNullOrWhiteSpace(text.ToString()))
                         {
                             yield return new RtfText(text.ToString());
                             text.Clear();
                         }

                         // close group
                         state = stack.Count > 0 ? stack.Pop() : RtfParseState.Group;
                         break;
                     }

                     text.Append(c);
                     break;

                 case RtfParseState.EscapedText:
                     if ((c == '\\') || (c == '}') || (c == '{'))
                     {
                         state = RtfParseState.Text;
                         text.Append(c);
                         break;
                     }

                     // ansi character escape
                     if (c == '\'')
                     {
                         char hi = (char)Reader.Read();
                         char lo = (char)Reader.Read();
                         text.Append(FromHexa(hi, lo));

                         // The escape is complete: go back to plain text. Staying in
                         // EscapedText would make the next literal character look like
                         // the start of a control word.
                         state = RtfParseState.Text;
                         break;
                     }

                     if (!string.IsNullOrWhiteSpace(text.ToString()))
                     {
                         yield return new RtfText(text.ToString());
                         text.Clear();
                     }

                     // in fact, it's a normal controlWord
                     controlWord.Append(c);
                     state = RtfParseState.ControlWord;
                     break;
             }
         }
         while (true);
     }

     /// <summary>
     /// Advances the enumerator until the current item's text equals <paramref name="word"/>.
     /// </summary>
     /// <returns>true if a matching item was reached; false if the input was exhausted.</returns>
     /// <exception cref="ArgumentNullException"><paramref name="enumerator"/> is null.</exception>
     public static bool MoveToNextControlWord(IEnumerator<RtfObject> enumerator, string word)
     {
         if (enumerator == null)
             throw new ArgumentNullException("enumerator");

         while (enumerator.MoveNext())
         {
             if (enumerator.Current.Text == word)
                 return true;
         }
         return false;
     }

     /// <summary>
     /// Advances the enumerator to the next <see cref="RtfText"/> item and returns its text.
     /// </summary>
     /// <returns>The text, or null if the input was exhausted first.</returns>
     /// <exception cref="ArgumentNullException"><paramref name="enumerator"/> is null.</exception>
     public static string GetNextText(IEnumerator<RtfObject> enumerator)
     {
         if (enumerator == null)
             throw new ArgumentNullException("enumerator");

         while (enumerator.MoveNext())
         {
             RtfText text = enumerator.Current as RtfText;
             if (text != null)
                 return text.Text;
         }
         return null;
     }

     /// <summary>
     /// Advances the enumerator to the next <see cref="RtfText"/> item and decodes its
     /// text as a sequence of two-digit hexadecimal bytes. Whitespace between digits
     /// is ignored.
     /// </summary>
     /// <returns>The decoded bytes, or null if the input was exhausted first.</returns>
     /// <exception cref="ArgumentNullException"><paramref name="enumerator"/> is null.</exception>
     /// <exception cref="FormatException">
     /// The text contains a non-hexadecimal character or an odd number of digits.
     /// </exception>
     public static byte[] GetNextTextAsByteArray(IEnumerator<RtfObject> enumerator)
     {
         if (enumerator == null)
             throw new ArgumentNullException("enumerator");

         while (enumerator.MoveNext())
         {
             RtfText text = enumerator.Current as RtfText;
             if (text == null)
                 continue;

             List<byte> bytes = new List<byte>(text.Text.Length / 2);
             bool hasHighNibble = false;
             char highNibble = '\0';

             foreach (char c in text.Text)
             {
                 if (char.IsWhiteSpace(c))
                     continue;

                 if (!hasHighNibble)
                 {
                     highNibble = c;
                     hasHighNibble = true;
                     continue;
                 }

                 bytes.Add((byte)FromHexa(highNibble, c));
                 hasHighNibble = false;
             }

             if (hasHighNibble)
                 throw new FormatException("Hexadecimal data contains an odd number of digits.");

             return bytes.ToArray();
         }
         return null;
     }

     // Extracts an EmbeddedObject/ObjectHeader from a stream
     // see [MS-OLEDS]: Object Linking and Embedding (OLE) Data Structures for more information
     // chapter 2.2: OLE1.0 Format Structures
     /// <summary>
     /// Reads an OLE 1.0 object header from <paramref name="inputStream"/> and copies the
     /// native data that follows it to <paramref name="outputStream"/>.
     /// </summary>
     /// <exception cref="ArgumentNullException">Either stream is null.</exception>
     /// <exception cref="NotSupportedException">The FormatID is not 2 (EmbeddedObject).</exception>
     public static void ExtractObjectData(Stream inputStream, Stream outputStream)
     {
         if (inputStream == null)
             throw new ArgumentNullException("inputStream");

         if (outputStream == null)
             throw new ArgumentNullException("outputStream");

         // The reader is intentionally not disposed: disposing it would close the
         // caller's stream.
         BinaryReader reader = new BinaryReader(inputStream);
         reader.ReadInt32(); // OLEVersion
         int formatId = reader.ReadInt32(); // FormatID
         if (formatId != 2) // see 2.2.4 Object Header. 2 means EmbeddedObject
             throw new NotSupportedException();

         ReadLengthPrefixedAnsiString(reader); // className
         ReadLengthPrefixedAnsiString(reader); // topicName
         ReadLengthPrefixedAnsiString(reader); // itemName

         int nativeDataSize = reader.ReadInt32();
         byte[] bytes = reader.ReadBytes(nativeDataSize);
         outputStream.Write(bytes, 0, bytes.Length);
     }

     // see chapter 2.1.4 LengthPrefixedAnsiString
     // Reads a 32-bit length followed by that many bytes; the last byte is dropped
     // from the returned string (it is treated as the terminator).
     private static string ReadLengthPrefixedAnsiString(BinaryReader reader)
     {
         int length = reader.ReadInt32();
         if (length == 0)
             return string.Empty;

         byte[] bytes = reader.ReadBytes(length);
         return Encoding.Default.GetString(bytes, 0, length - 1);
     }

     private enum RtfParseState
     {
         ControlWord,
         Text,
         EscapedText,
         Group
     }

     // Converts two hexadecimal digits into the character with that byte value.
     // Parsed with the invariant culture because this is machine data, not user text.
     private static char FromHexa(char hi, char lo)
     {
         return (char)byte.Parse(hi.ToString() + lo, NumberStyles.HexNumber, CultureInfo.InvariantCulture);
     }
 }

 // Utility class to parse an OLE1.0 OLEOBJECT
 /// <summary>
 /// Represents the contents of a serialized Object Packager instance: the display
 /// name, icon information, original file path and the packaged file's bytes.
 /// </summary>
 public class PackagedObject
 {
     private PackagedObject()
     {
     }

     public string DisplayName { get; private set; }
     public string IconFilePath { get; private set; }
     public int IconIndex { get; private set; }
     public string FilePath { get; private set; }
     public byte[] Data { get; private set; }

     // Reads bytes up to (and consuming) a zero terminator, mapping each byte
     // directly to the char with the same value.
     private static string ReadAnsiString(BinaryReader reader)
     {
         StringBuilder sb = new StringBuilder();
         do
         {
             byte b = reader.ReadByte();
             if (b == 0)
                 return sb.ToString();

             sb.Append((char)b);
         }
         while (true);
     }

     /// <summary>
     /// Parses a packaged object from the stream's current position.
     /// </summary>
     /// <exception cref="ArgumentNullException"><paramref name="inputStream"/> is null.</exception>
     /// <exception cref="NotSupportedException">The package type is not 3 (file).</exception>
     public static PackagedObject Extract(Stream inputStream)
     {
         if (inputStream == null)
             throw new ArgumentNullException("inputStream");

         // Not disposed on purpose: the stream belongs to the caller.
         BinaryReader reader = new BinaryReader(inputStream);
         reader.ReadUInt16(); // sig

         PackagedObject po = new PackagedObject();
         po.DisplayName = ReadAnsiString(reader);
         po.IconFilePath = ReadAnsiString(reader);
         po.IconIndex = reader.ReadUInt16();

         int type = reader.ReadUInt16();
         if (type != 3) // 3 is file, 1 is link
             throw new NotSupportedException();

         reader.ReadInt32(); // nextsize
         po.FilePath = ReadAnsiString(reader);

         int dataSize = reader.ReadInt32();
         po.Data = reader.ReadBytes(dataSize);
         // note after that, there may be unicode + long path info
         return po;
     }
 }

 /// <summary>Base type of the tokens produced by <see cref="RtfReader"/>.</summary>
 public class RtfObject
 {
     /// <param name="text">Token content; stored with surrounding whitespace trimmed.</param>
     /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
     public RtfObject(string text)
     {
         if (text == null)
             throw new ArgumentNullException("text");

         Text = text.Trim();
     }

     public string Text { get; private set; }
 }

 /// <summary>A run of text found in the RTF stream.</summary>
 public class RtfText : RtfObject
 {
     public RtfText(string text)
         : base(text)
     {
     }
 }

 /// <summary>An RTF control word (stored without its leading backslash).</summary>
 public class RtfControlWord : RtfObject
 {
     public RtfControlWord(string name)
         : base(name)
     {
     }
 }

好的,这应该适合你。 为了演示我的解决方案,我创建了一个带有PictureBox的WinForms项目,其绘图事件处理程序映射到以下函数:

  /// <summary>
  /// Paint handler that loads "MySampleFile.rtf", decodes the picture embedded in
  /// it and draws that picture into a 100x100 rectangle at the origin.
  /// </summary>
  /// <param name="sender">The control raising the Paint event.</param>
  /// <param name="e">Paint arguments supplying the target Graphics.</param>
  private void rtfImage_Paint(object sender, PaintEventArgs e)
  {
      string rtfStr = System.IO.File.ReadAllText("MySampleFile.rtf");
      string imageDataHex = ExtractImgHex(rtfStr);
      byte[] imageBuffer = ToBinary(imageDataHex);

      // Image.FromStream requires the stream to stay open for the lifetime of the
      // Image, so the image is drawn (and disposed) before the stream is closed.
      // Disposing the image also avoids leaking a GDI+ handle on every repaint.
      using (MemoryStream stream = new MemoryStream(imageBuffer))
      using (Image image = Image.FromStream(stream))
      {
          Rectangle rect = new Rectangle(0, 0, 100, 100);
          e.Graphics.DrawImage(image, rect);
      }
  }

此代码依赖于System.Drawing.Image.FromStream()方法以及两个“帮助程序”函数:

字符串提取器:

  /// <summary>
  /// Returns the hexadecimal payload of the first <c>{\pict\...}</c> group in an RTF
  /// string: everything between the first space after the tag and the next
  /// closing brace.
  /// </summary>
  /// <param name="s">The RTF text to search.</param>
  /// <returns>The raw hex text of the picture (it may contain whitespace).</returns>
  /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
  /// <exception cref="FormatException">
  /// No picture group was found, or it lacks the separating space or closing brace.
  /// </exception>
  string ExtractImgHex(string s)
  {
      // I'm sure you could use regex here, but this works.
      // This assumes one picture per file; loops required otherwise
      if (s == null)
          throw new ArgumentNullException("s");

      // RTF markup is not linguistic text, so search ordinally.
      int pictTagIdx = s.IndexOf("{\\pict\\", StringComparison.Ordinal);
      if (pictTagIdx < 0)
          throw new FormatException("The RTF text does not contain a {\\pict group.");

      int separatorIdx = s.IndexOf(' ', pictTagIdx);
      if (separatorIdx < 0)
          throw new FormatException("The {\\pict group has no space separating its control words from the picture data.");

      int startIndex = separatorIdx + 1;
      int endIndex = s.IndexOf('}', startIndex);
      if (endIndex < 0)
          throw new FormatException("The {\\pict group is not terminated by a closing brace.");

      return s.Substring(startIndex, endIndex - startIndex);
  }

……和二进制转换器:

  /// <summary>
  /// Converts a string of hexadecimal digit pairs into the bytes they encode.
  /// Whitespace anywhere in the input is ignored; a trailing unpaired digit is dropped.
  /// </summary>
  /// <param name="imageDataHex">Hexadecimal text, optionally containing whitespace.</param>
  /// <returns>One byte per pair of hexadecimal digits.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="imageDataHex"/> is null.</exception>
  /// <exception cref="FormatException">A non-whitespace character is not a hex digit.</exception>
  public static byte[] ToBinary(string imageDataHex)
  {
      //this function adapted from:
      // http://www.codeproject.com/Articles/27431/Writing-Your-Own-RTF-Converter
      if (imageDataHex == null)
      {
          throw new ArgumentNullException("imageDataHex");
      }

      // Count only the significant characters. Sizing the buffer from the raw
      // string length would leave trailing zero bytes whenever the input
      // contains spaces or line breaks.
      int hexDigits = 0;
      for (int i = 0; i < imageDataHex.Length; i++)
      {
          if (!char.IsWhiteSpace(imageDataHex[i]))
          {
              hexDigits++;
          }
      }

      byte[] imageDataBinary = new byte[hexDigits / 2];
      StringBuilder hex = new StringBuilder(2);
      int dataPos = 0;

      for (int i = 0; i < imageDataHex.Length; i++)
      {
          char c = imageDataHex[i];
          if (char.IsWhiteSpace(c))
          {
              continue;
          }

          hex.Append(c);
          if (hex.Length == 2)
          {
              imageDataBinary[dataPos] = byte.Parse(
                  hex.ToString(),
                  System.Globalization.NumberStyles.HexNumber,
                  System.Globalization.CultureInfo.InvariantCulture);
              dataPos++;
              hex.Remove(0, 2);
          }
      }

      return imageDataBinary;
  }