两个字符串之间的所有公共子串

我正在使用C#来查找两个字符串之间的所有常见子字符串。 例如,如果输入为:S1 =“需要与电子邮件协商”S2 =“需要电子邮件协助”

输出应该是“需要帮助电子邮件”

下面的代码返回最长的公共子字符串,但我希望我的代码返回所有常见的子字符串。 任何帮助深表感谢!

static void commonsubstrings() { input1 = "need asssitance with email"; input2 = "email assistance needed" if (input2.Length > input1.Length) { swap = input1; input1 = input2; input2 = swap; } int k = 1; String temp; String longTemp = ""; for (int i = 0; (i <= input1.Length); i++) { if ((i == input1.Length)) { if (longest != null) { k = longest.Length + 1; } else { k = 1; } temp = input1.Substring(1, input1.Length - 1); if (temp.Equals("")) { break; } if (k  longTemp.Length)) { longTemp = longest; } } } holder1 = input1.Substring(0, k); for (int j = 0; (j < input2.Length) && (j + k <= input2.Length); j++) { check1 = input2.Substring(j, k); if (holder1.Equals(check1)) { longest = holder1; break; } } k++; } Console.WriteLine(longest); Console.ReadLine(); 

}

  public static string [] CommonString(string left, string right) { List result = new List(); string [] rightArray = right.Split(); string [] leftArray = left.Split(); result.AddRange(rightArray.Where(r => leftArray.Any(l => l.StartsWith(r)))); // must check other way in case left array contains smaller words than right array result.AddRange(leftArray.Where(l => rightArray.Any(r => r.StartsWith(l)))); return result.Distinct().ToArray(); } 

一种不同的方法:你可以使用levenshtein距离来找到两个单词的相似性。 如果距离小于指定值,则可以将两个字符串视为相等。 然后你可以使用levenshtein comparer for Enumerable.Intersect

然后它很简单:

 string S1= "need asssitance with email" ; string S2 = "email assistance needed"; string[] words1 = S1.Split(); string[] words2 = S2.Split(); var wordsIntersecting = words1.Intersect(words2, new LevenshteinComparer()); string output = string.Join(" ", wordsIntersecting); 

输出: need asssitance email

这是自定义比较器:

 class LevenshteinComparer : IEqualityComparer { public int MaxDistance { get; set; } private Levenshtein _Levenshtein = new Levenshtein(); public LevenshteinComparer() : this(50) { } public LevenshteinComparer(int maxDistance) { this.MaxDistance = maxDistance; } public bool Equals(string x, string y) { int distance = _Levenshtein.iLD(x, y); return distance <= MaxDistance; } public int GetHashCode(string obj) { return 0; } } 

这是levenshtein算法的一个实现:

 public class Levenshtein { ///***************************** /// Compute Levenshtein distance /// Memory efficient version ///***************************** public int iLD(String sRow, String sCol) { int RowLen = sRow.Length; // length of sRow int ColLen = sCol.Length; // length of sCol int RowIdx; // iterates through sRow int ColIdx; // iterates through sCol char Row_i; // ith character of sRow char Col_j; // jth character of sCol int cost; // cost /// Test string length if (Math.Max(sRow.Length, sCol.Length) > Math.Pow(2, 31)) throw (new Exception("\nMaximum string length in Levenshtein.iLD is " + Math.Pow(2, 31) + ".\nYours is " + Math.Max(sRow.Length, sCol.Length) + ".")); // Step 1 if (RowLen == 0) { return ColLen; } if (ColLen == 0) { return RowLen; } /// Create the two vectors int[] v0 = new int[RowLen + 1]; int[] v1 = new int[RowLen + 1]; int[] vTmp; /// Step 2 /// Initialize the first vector for (RowIdx = 1; RowIdx <= RowLen; RowIdx++) { v0[RowIdx] = RowIdx; } // Step 3 /// Fore each column for (ColIdx = 1; ColIdx <= ColLen; ColIdx++) { /// Set the 0'th element to the column number v1[0] = ColIdx; Col_j = sCol[ColIdx - 1]; // Step 4 /// Fore each row for (RowIdx = 1; RowIdx <= RowLen; RowIdx++) { Row_i = sRow[RowIdx - 1]; // Step 5 if (Row_i == Col_j) { cost = 0; } else { cost = 1; } // Step 6 /// Find minimum int m_min = v0[RowIdx] + 1; int b = v1[RowIdx - 1] + 1; int c = v0[RowIdx - 1] + cost; if (b < m_min) { m_min = b; } if (c < m_min) { m_min = c; } v1[RowIdx] = m_min; } /// Swap the vectors vTmp = v0; v0 = v1; v1 = vTmp; } // Step 7 /// Value between 0 - 100 /// 0==perfect match 100==totaly different /// /// The vectors where swaped one last time at the end of the last loop, /// that is why the result is now in v0 rather than in v1 //System.Console.WriteLine("iDist=" + v0[RowLen]); int max = System.Math.Max(RowLen, ColLen); return ((100 * v0[RowLen]) / max); } ///***************************** /// Compute the min ///***************************** private int Minimum(int a, int b, int c) { int mi = a; if (b < mi) { mi = b; } if (c < mi) { mi = c; } return mi; } ///***************************** /// Compute Levenshtein distance ///***************************** public int LD(String sNew, String sOld) { int[,] matrix; // matrix int sNewLen = sNew.Length; // length of sNew int sOldLen = sOld.Length; // length of sOld int sNewIdx; // iterates through sNew int sOldIdx; // iterates through sOld char sNew_i; // ith character of sNew char sOld_j; // jth character of sOld int cost; // cost /// Test string length if (Math.Max(sNew.Length, sOld.Length) > Math.Pow(2, 31)) throw (new Exception("\nMaximum string length in Levenshtein.LD is " + Math.Pow(2, 31) + ".\nYours is " + Math.Max(sNew.Length, sOld.Length) + ".")); // Step 1 if (sNewLen == 0) { return sOldLen; } if (sOldLen == 0) { return sNewLen; } matrix = new int[sNewLen + 1, sOldLen + 1]; // Step 2 for (sNewIdx = 0; sNewIdx <= sNewLen; sNewIdx++) { matrix[sNewIdx, 0] = sNewIdx; } for (sOldIdx = 0; sOldIdx <= sOldLen; sOldIdx++) { matrix[0, sOldIdx] = sOldIdx; } // Step 3 for (sNewIdx = 1; sNewIdx <= sNewLen; sNewIdx++) { sNew_i = sNew[sNewIdx - 1]; // Step 4 for (sOldIdx = 1; sOldIdx <= sOldLen; sOldIdx++) { sOld_j = sOld[sOldIdx - 1]; // Step 5 if (sNew_i == sOld_j) { cost = 0; } else { cost = 1; } // Step 6 matrix[sNewIdx, sOldIdx] = Minimum(matrix[sNewIdx - 1, sOldIdx] + 1, matrix[sNewIdx, sOldIdx - 1] + 1, matrix[sNewIdx - 1, sOldIdx - 1] + cost); } } // Step 7 /// Value between 0 - 100 /// 0==perfect match 100==totaly different System.Console.WriteLine("Dist=" + matrix[sNewLen, sOldLen]); int max = System.Math.Max(sNewLen, sOldLen); return (100 * matrix[sNewLen, sOldLen]) / max; } } 

对Levenshtein类的信任: http : //www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm

使用Set-Intersections

从例程开始查找字符串的所有可能子字符串。 这是在Python中,它是“为读者练习”将其转换为C#’:

 def allSubstr(instring): retset = set() retset.add(instring) totlen = len(instring) for thislen in range(0, totlen): for startpos in range(0, totlen): # print "startpos: %s, thislen: %s" % (startpos, thislen) addStr = instring[startpos:startpos+thislen] # print "addstr: %s" % (addStr) retset.add(addStr) print "retset total: %s" % (retset) return retset set1 = allSubstr('abcdefg') set2 = allSubstr('cdef') print set1.intersection(set2) 

这是输出:

 >>> set1 = allSubstr('abcdefg') retset total: set(['', 'cde', 'ab', 'ef', 'cd', 'abcdef', 'abc', 'efg', 'bcde', 'cdefg', 'bc', 'de', 'bcdef', 'abcd', 'defg', 'fg', 'cdef', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'bcd', 'abcde', 'abcdefg', 'bcdefg', 'def']) >>> set2 = allSubstr('cdef') retset total: set(['', 'cde', 'c', 'ef', 'e', 'd', 'f', 'de', 'cd', 'cdef', 'def']) >>> >>> set1.intersection(set2) set(['', 'cde', 'c', 'de', 'e', 'd', 'f', 'ef', 'cd', 'cdef', 'def']) 

不,您对长度为1的子集不感兴趣。但是,在执行set.add()调用之前,您始终可以添加长度限制。