在大文件中搜索和替换正则表达式而不会出现OutOfMemoryException

我写了一小段代码,用正则表达式搜索字符串并将其替换为其他字符串,然后把修改后的内容写入一个新的输出文件。 这段代码对较小的文件似乎可以正常工作,但对于100 MB或更大的文件,会抛出System.OutOfMemoryException异常。

这是我的代码:

// Build the output path: a file named "FIXED_<input name without extension>.tmx" on the current user's Desktop.
string foldername = Path.Combine( Environment.GetFolderPath(Environment.SpecialFolder.Desktop), String.Format("FIXED_{0}.tmx", Path.GetFileNameWithoutExtension(textBox1.Text)));
// Reads the entire input file into one string; this is the statement the surrounding text reports
// as failing with OutOfMemoryException on inputs of roughly 100 MB and larger.
string text = File.ReadAllText(textBox1.Text);
// NOTE(review): this pattern looks truncated (it begins with ']'); presumably markup-like text was
// stripped from both the pattern and the replacement when the snippet was extracted. Confirm against the original.
text = Regex.Replace(text, @"]*>", "", RegexOptions.Multiline);
// NOTE(review): as written this removes every single space character; presumably the original pattern
// and replacement contained tags that were stripped as well. Confirm against the original.
text = Regex.Replace(text, @" ", "", RegexOptions.Multiline);
// Write the transformed text to the output path in a single call.
File.WriteAllText(foldername, text);

Visual Studio突出显示string text = File.ReadAllText(textBox1.Text); 部分。 我认为也许使用File.ReadAllLines可以更好地工作,但我无法使其与正则表达式一起工作。

有人可以帮我吗? 我是C#的新手,可能我的代码不是最好的。

恐怕你必须自己实现替换逻辑。 以下是使用状态机将带属性的 <seg ...> 开始标签替换为 <seg> 的示例代码。 唯一的问题是,如果文件在尚未闭合的 <seg ... 标签中间结束,那么只有 <seg 会被写入输出。

 /// <summary>Scanner states used by TruncateSeg while matching an opening "seg" tag.</summary>
 enum TruncateSegState
 {
     /// <summary>Not inside a candidate tag.</summary>
     Idle,
     /// <summary>The previous character was the opening angle bracket.</summary>
     TagStart,
     /// <summary>The opening bracket followed by 's' has been read.</summary>
     TagStartS,
     /// <summary>The opening bracket followed by "se" has been read.</summary>
     TagStartSE,
     /// <summary>The opening bracket followed by "seg" has been read.</summary>
     TagStartSEG,
     /// <summary>Inside the attribute part of a "seg" tag; characters are dropped until the closing bracket.</summary>
     TagSEG
 }

 /// <summary>
 /// Copies <paramref name="input"/> to <paramref name="output"/> one character at a time,
 /// dropping everything between "&lt;seg" followed by whitespace and the next '&gt;', so that
 /// an opening seg tag with attributes is written as a bare "&lt;seg&gt;".
 /// </summary>
 /// <param name="input">Reader positioned at the start of the text to process.</param>
 /// <param name="output">Writer that receives the processed text.</param>
 /// <remarks>
 /// If the input ends inside the dropped attribute part, the characters dropped so far are not
 /// written, so the output ends with "&lt;seg".
 /// </remarks>
 static void TruncateSeg(StreamReader input, StreamWriter output)
 {
     TruncateSegState state = TruncateSegState.Idle;
     int next;
     while ((next = input.Read()) != -1)
     {
         char ch = (char)next;

         // State to fall back to when ch does not continue the "<seg" prefix. A '<' must
         // restart the match instead of dropping to Idle, otherwise inputs such as
         // "<<seg a>" or "<s<seg a>" would leave the second tag untouched.
         TruncateSegState onMismatch = ch == '<' ? TruncateSegState.TagStart : TruncateSegState.Idle;

         switch (state)
         {
             case TruncateSegState.Idle:
                 state = onMismatch;
                 output.Write(ch);
                 break;

             case TruncateSegState.TagStart:
                 state = ch == 's' ? TruncateSegState.TagStartS : onMismatch;
                 output.Write(ch);
                 break;

             case TruncateSegState.TagStartS:
                 state = ch == 'e' ? TruncateSegState.TagStartSE : onMismatch;
                 output.Write(ch);
                 break;

             case TruncateSegState.TagStartSE:
                 state = ch == 'g' ? TruncateSegState.TagStartSEG : onMismatch;
                 output.Write(ch);
                 break;

             case TruncateSegState.TagStartSEG:
                 if (char.IsWhiteSpace(ch))
                 {
                     // Whitespace after "<seg" starts the attribute part: stop echoing.
                     state = TruncateSegState.TagSEG;
                 }
                 else
                 {
                     // Either the tag closes right away ("<seg>") or this is a longer
                     // name such as "<segment"; in both cases keep echoing.
                     state = onMismatch;
                     output.Write(ch);
                 }
                 break;

             case TruncateSegState.TagSEG:
                 // Swallow attribute characters; only the closing bracket is written.
                 if (ch == '>')
                 {
                     state = TruncateSegState.Idle;
                     output.Write(ch);
                 }
                 break;
         }
     }
 }

用法:

 // Run TruncateSeg from "input.txt" into "temp.txt"; both files are closed when the blocks exit.
 using (StreamReader source = new StreamReader("input.txt"))
 {
     using (StreamWriter sink = new StreamWriter("temp.txt"))
     {
         TruncateSeg(source, sink);
     }
 }

这会生成temp.txt;将其用作下一个方法的输入,该方法会补上缺失的 </seg> 结束标记。

 /// <summary>Scanner states used by ReplaceSegTuv.</summary>
 enum ReplaceSegTuvState
 {
     /// <summary>Copying input through while looking for an opening "seg" tag.</summary>
     Idle,
     /// <summary>Just after an opening "seg" tag; only whitespace has followed so far.</summary>
     InsideSEG
 }

 /// <summary>
 /// Copies <paramref name="input"/> to <paramref name="output"/>, and wherever "&lt;seg&gt;" is
 /// followed (after optional whitespace) directly by "&lt;/tuv&gt;", writes the missing
 /// "&lt;/seg&gt;" before the "&lt;/tuv&gt;". The whitespace between the two tags is dropped
 /// in that case; all other text is copied unchanged.
 /// </summary>
 /// <param name="input">Reader positioned at the start of the text to process.</param>
 /// <param name="output">Writer that receives the processed text.</param>
 static void ReplaceSegTuv(StreamReader input, StreamWriter output)
 {
     const string SegOpenTail = "seg>";
     const string TuvCloseTail = "/tuv>";

     ReplaceSegTuvState state = ReplaceSegTuvState.Idle;
     // Whitespace seen after "<seg>" that has not been written yet.
     StringBuilder segBuffer = new StringBuilder();

     while (!input.EndOfStream)
     {
         char ch = (char)input.Read();
         switch (state)
         {
             case ReplaceSegTuvState.Idle:
                 if (ch == '<')
                 {
                     // NOTE(review): this fixed-size lookahead consumes the characters after
                     // the '<', so a "<seg>" that begins inside the lookahead window (for
                     // example right after a tag shorter than four characters) is not detected.
                     char[] buffer = new char[SegOpenTail.Length];
                     int bufferActualLength = input.ReadBlock(buffer, 0, buffer.Length);
                     output.Write('<');
                     output.Write(buffer, 0, bufferActualLength);
                     if (new string(buffer, 0, bufferActualLength) == SegOpenTail)
                     {
                         segBuffer.Clear();
                         state = ReplaceSegTuvState.InsideSEG;
                     }
                 }
                 else
                 {
                     output.Write(ch);
                 }
                 break;

             case ReplaceSegTuvState.InsideSEG:
                 if (ch == '<')
                 {
                     char[] buffer = new char[TuvCloseTail.Length];
                     int bufferActualLength = input.ReadBlock(buffer, 0, buffer.Length);
                     if (new string(buffer, 0, bufferActualLength) == TuvCloseTail)
                     {
                         // The "</tuv>" just read has been consumed from the input, so it
                         // must be written back, preceded by the missing "</seg>".
                         output.Write("</seg>");
                         output.Write("</tuv>");
                     }
                     else
                     {
                         // Some other tag: emit the held-back whitespace and the tag text as read.
                         output.Write(segBuffer.ToString());
                         output.Write('<');
                         output.Write(buffer, 0, bufferActualLength);
                     }
                     state = ReplaceSegTuvState.Idle;
                 }
                 else if (!char.IsWhiteSpace(ch))
                 {
                     // The segment has real content: emit the held-back whitespace and continue copying.
                     output.Write(segBuffer.ToString());
                     output.Write(ch);
                     state = ReplaceSegTuvState.Idle;
                 }
                 else
                 {
                     segBuffer.Append(ch);
                 }
                 break;
         }
     }

     // Input ended right after "<seg>" plus whitespace: do not lose the held-back whitespace.
     if (state == ReplaceSegTuvState.InsideSEG)
     {
         output.Write(segBuffer.ToString());
     }
 }

用法:

 // Run ReplaceSegTuv from "temp.txt" into "output.txt"; both files are closed when the blocks exit.
 using (StreamReader source = new StreamReader("temp.txt"))
 {
     using (StreamWriter sink = new StreamWriter("output.txt"))
     {
         ReplaceSegTuv(source, sink);
     }
 }

可以考虑使用内存映射文件(memory-mapped files); 参见MSDN: http://msdn.microsoft.com/en-us/library/dd997372.aspx

它们似乎是处理非常大的文件的推荐方法。