Flesch-Kincaid readability test

Is there any open-source .NET library that can handle the Flesch-Kincaid readability calculation?

Wiki: http://en.wikipedia.org/wiki/Flesch-Kincaid_readability_test

It's not open source, but you can delegate to Word via the ReadabilityStatistic interface. Even if your document isn't in Word, you can open Word (invisible to the user), dump your text into it, and then use ReadabilityStatistic to compute the statistics.
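
A minimal, untested sketch of that approach, assuming the Microsoft.Office.Interop.Word PIA is referenced (not embedded interop types) and that your Word build exposes the English statistic name "Flesch-Kincaid Grade Level"; adjust names and cleanup for your Word version:

    open Microsoft.Office.Interop.Word

    // Start a hidden Word instance, dump the text into a new document,
    // and read back the grade level Word computes.
    let gradeLevelViaWord (text: string) =
        let app = new ApplicationClass()
        app.Visible <- false
        try
            let doc = app.Documents.Add()
            doc.Content.Text <- text
            float (doc.Content.ReadabilityStatistics.Item("Flesch-Kincaid Grade Level").Value)
        finally
            app.Quit()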

As described under the Flesch-Kincaid grade level formula:

https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

You need to count words, sentences, and syllables. Syllables are probably the trickiest, although sentences also take some thought.
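
For reference, the grade-level formula is 0.39 * (total words / total sentences) + 11.8 * (total syllables / total words) - 15.59, which is exactly what the code at the end of this answer computes.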

Below are two translations into F# of other people's syllable-counting code (it's .NET: you can create an F# project in Visual Studio and then reference it from your C# project). I've done basic but not extensive testing.

I found that Ipeirotis gave better results than Child on some of my test cases (once I added the problem-word list). My test words were:

 let testWords = [|"abalone";"gracious";"atheism";"unaware"; "seaside";"underwater";"wonderwoman";"biology"|] 

The Child code in particular has trouble with the end of that list. Reordering the regexes from longest affix to shortest didn't seem to fix it.

My translations:

    module Readability

    open System.Text.RegularExpressions

    //for syllables

    // Problem words whose syllable counts the regex rules get wrong.
    // From https://github.com/DaveChild/Text-Statistics/blob/master/src/DaveChild/TextStatistics/Syllables.php
    // (defined before the counting functions so both can reference it)
    let problemWordMap =
        dict [
            ("abalone", 4); ("abare", 3); ("abed", 2); ("abruzzese", 4); ("abbruzzese", 4);
            ("aborigine", 5); ("aborigines", 5); //andrew plural (ap)
            ("acreage", 3); ("acreage", 3); //ap
            ("adame", 3); ("adieu", 2); ("adobe", 3);
            ("anemone", 4); ("anemones", 4); //ap
            ("apache", 3); ("apaches", 3); //ap
            ("aphrodite", 4);
            ("apostrophe", 4); ("apostrophes", 4); //ap
            ("ariadne", 4);
            ("cafe", 2); ("cafes", 2); //ap
            ("calliope", 4);
            ("catastrophe", 4); ("catastrophes", 4); //ap
            ("chile", 2); ("chiles", 2); //ap
            ("chloe", 2); ("circe", 2);
            ("coyote", 3); ("coyotes", 3); //ap
            ("epitome", 4); ("forever", 3); ("gethsemane", 4);
            ("guacamole", 4); ("guacamoles", 4); //ap
            ("hyperbole", 4); ("hyperboles", 4); //ap
            ("jesse", 2);
            ("jukebox", 2); ("jukeboxes", 2); //ap
            ("karate", 3); ("karates", 3); //ap
            ("machete", 3); ("maybe", 2); ("people", 2); ("recipe", 3); ("sesame", 3); ("shoreline", 2); ("simile", 3);
            ("machetes", 3); //ap
            ("maybes", 2); //ap
            ("peoples", 2); //ap
            ("recipes", 3); //ap
            ("sesames", 3); //ap
            ("shorelines", 2); //ap
            ("similes", 3); //ap
            ("syncope", 3);
            ("tamale", 3); ("tamales", 3); //ap
            ("yosemite", 4); ("daphne", 2); ("eurydice", 4); ("euterpe", 3); ("hermione", 4);
            ("penelope", 4); ("persephone", 4); ("phoebe", 2); ("zoe", 2);
        ]

    //simpler:
    //https://github.com/ipeirotis/ReadabilityMetrics/blob/master/src/main/java/com/ipeirotis/readability/engine/Syllabify.java
    let SyllableCount2 (word:string) =
        let SubSyl = [| "cial"; "tia"; "cius"; "cious"; "giu"; "ion"; "iou"; "sia$"; ".ely$" |]
        let AddSyl = [| "ia"; "riet"; "dien"; "iu"; "io"; "ii"; "[aeiouym]bl$"; "[aeiou]{3}"; "^mc"; "ism$"; "[^aeiouy][^aeiouy]l$"; "[^l]lien"; "^coa[dglx]."; "[^gq]ua[^auieo]"; "dnt$" |]
        let mutable tempWord = word.ToLower()
        tempWord <- tempWord.Replace("'", " ")
        if problemWordMap.ContainsKey( word ) then
            problemWordMap.[word]
        else if tempWord = "i" || tempWord = "a" then
            1
        else
            if tempWord.EndsWith("e") then
                tempWord <- tempWord.Substring(0, tempWord.Length - 1)
            let phonems = Regex.Split(tempWord, "[^aeiouy]+")
            let mutable syl = 0
            for i = 0 to SubSyl.Length - 1 do
                let syllabe = SubSyl.[i]
                if Regex.IsMatch( tempWord, syllabe) then syl <- syl - 1
            for i = 0 to AddSyl.Length - 1 do
                let syllabe = AddSyl.[i]
                if Regex.IsMatch( tempWord, syllabe) then syl <- syl + 1
            if tempWord.Length = 1 then syl <- syl + 1
            for i = 0 to phonems.Length - 1 do
                if phonems.[i].Length > 0 then syl <- syl + 1
            if syl = 0 then syl <- 1
            // return
            syl

    //https://github.com/DaveChild/Text-Statistics/blob/master/src/DaveChild/TextStatistics/Syllables.php

    // These syllables would be counted as two but should be one
    let oneSyllableCorrection =
        [|
            "cia(l|$)"; // glacial, acacia
            "tia";
            "cius";
            "cious";
            "[^aeiou]giu";
            "[aeiouy][^aeiouy]ion";
            "iou";
            "sia$";
            "eous$";
            "[oa]gue$";
            ".[^aeiuoycgltdb]{2,}ed$";
            ".ely$";
            //"[cg]h?ed?$";
            //"rved?$";
            //"[aeiouy][dt]es?$";
            //"^[dr]e[aeiou][^aeiou]+$"; // Sorts out deal, deign etc
            //"[aeiouy]rse$"; // Purse, hearse
            "^jua";
            //"nne[ds]?$"; // canadienne
            "uai"; // acquainted
            "eau"; // champeau
            //"pagne[ds]?$"; // champagne
            //"[aeiouy][^aeiuoytdbcgrnzs]h?e[rsd]?$";
            // The following detects words ending with a soft e ending. Don't
            // mess with it unless you absolutely have to! The following
            // is a list of words you can use to test a new version of
            // this rule (add "r", "s" and "d" where possible to test fully):
            // absolve, acquiesce, audience, ache, acquire, brunelle, byrne,
            // canadienne, coughed, curved, champagne, designate, force, lace,
            // late, lathe, make, relayed, scrounge, side, sideline, some, wide, taste
            "[aeiouy](b|c|ch|d|dg|f|g|gh|gn|k|l|ll|lv|m|mm|n|nc|ng|nn|p|r|rc|rn|rs|rv|s|sc|sk|sl|squ|ss|st|t|th|v|y|z)e$";
            // For soft e endings with a "d". Test words:
            // crunched, forced, hated, sided, sidelined, unexploded, unexplored,
            // scrounged, squelched, forced
            "[aeiouy](b|c|ch|dg|f|g|gh|gn|k|l|lch|ll|lv|m|mm|n|nc|ng|nch|nn|p|r|rc|rn|rs|rv|s|sc|sk|sl|squ|ss|th|v|y|z)ed$";
            // For soft e endings with a "s". Test words:
            // absences, accomplices, acknowledges, advantages, byrnes, crunches,
            // forces, scrounges, squelches
            "[aeiouy](b|ch|d|f|gh|gn|k|l|lch|ll|lv|m|mm|n|nch|nn|p|r|rn|rs|rv|s|sc|sk|sl|squ|ss|st|t|th|v|y)es$";
            "^busi$";
        |] |> String.concat("|") |> Regex

    // These syllables would be counted as one but should be two
    let twoSyllableCorrection =
        [|
            "([^s]|^)ia";
            "riet";
            "dien"; // audience
            "iu";
            "io";
            "eo($|[b-df-hj-np-tv-z])";
            "ii";
            "[ou]a$";
            "[aeiouym]bl$";
            "[aeiou]{3}";
            "[aeiou]y[aeiou]";
            "^mc";
            "ism$";
            "asm$";
            "thm$";
            @"([^aeiouy])\1l$";
            "[^l]lien";
            "^coa[dglx].";
            "[^gq]ua[^auieo]";
            "dnt$";
            "uity$";
            "[^aeiouy]ie(r|st|t)$";
            "eings?$";
            "[aeiouy]sh?e[rsd]$";
            "iell";
            "dea$";
            "real"; // real, cereal
            "[^aeiou]y[ae]"; // bryan, byerley
            "gean$"; // aegean
            "uen"; // influence, affluence
        |] |> String.concat("|") |> Regex

    // Single syllable prefixes and suffixes
    let oneSyllableAffix =
        [|
            "^un"; "^fore"; "^ware"; "^none?"; "^out"; "^post"; "^sub"; "^pre"; "^pro"; "^dis"; "^side";
            "ly$"; "less$"; "some$"; "ful$"; "ers?$"; "ness$"; "cians?$"; "ments?$"; "ettes?$"; "villes?$";
            "ships?$"; "sides?$"; "ports?$"; "shires?$"; "tion(ed)?$";
        |] |> String.concat("|") |> Regex

    // Double syllable prefixes and suffixes
    let twoSyllableAffix =
        [|
            "^above"; "^ant[ie]"; "^counter"; "^hyper"; "^afore"; "^agri"; "^in[ft]ra"; "^inter"; "^over";
            "^semi"; "^ultra"; "^under"; "^extra"; "^dia"; "^micro"; "^mega"; "^kilo"; "^pico"; "^nano"; "^macro";
            "berry$"; "woman$"; "women$";
        |] |> String.concat("|") |> Regex

    // Triple syllable prefixes and suffixes
    let threeSyllableAffix =
        [| "ology$"; "ologist$"; "onomy$"; "onomist$" |] |> String.concat("|") |> Regex

    /// For each match in pattern, replace match with empty string in input word,
    /// returning bare word and # matches
    let RegexReplace (regex:Regex) word =
        //let affixReplace = new Regex( pattern )
        let matches = regex.Matches(word)
        let mutable bareWord = word
        for aMatch in matches do
            bareWord <- bareWord.Replace(aMatch.Value,"")
        // return
        bareWord, matches.Count

    //need to exclude a group?
    let CountMatches (regex:Regex) word =
        //let regex = new Regex( pattern )
        let matches = regex.Matches(word)
        // return
        matches.Count

    /// Counts syllables in word. Assumes word has already been "cleaned"
    let SyllableCount( word : string) =
        if problemWordMap.ContainsKey( word ) then
            problemWordMap.[word]
        else
            //remove and count affixes
            let wordMinus1Affix, oneAffixCount = RegexReplace oneSyllableAffix word
            let wordMinus2Affix, twoAffixCount = RegexReplace twoSyllableAffix wordMinus1Affix
            let wordMinus3Affix, threeAffixCount = RegexReplace threeSyllableAffix wordMinus2Affix
            //count word parts
            let vowelSplit = Regex.Split(wordMinus3Affix, "[^aeiouy]")
            let mutable wordPartCount = 0
            for wordPart in vowelSplit do
                if wordPart.Length > 0 then wordPartCount <- wordPartCount + 1
            //base syllable count
            let mutable baseSyllableCount = oneAffixCount + twoAffixCount + threeAffixCount + wordPartCount
            //handle degenerate cases
            let oneSyllableCorrectionCount = CountMatches oneSyllableCorrection word //count two as one: subtract
            let twoSyllableCorrectionCount = CountMatches twoSyllableCorrection word //count one as two: add
            baseSyllableCount <- baseSyllableCount - oneSyllableCorrectionCount + twoSyllableCorrectionCount
            //we always have 1 syllable in a word
            if baseSyllableCount > 0 then baseSyllableCount else 1
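
A quick, hypothetical way to compare the two counters on the test words above (assuming the module compiles and is referenced as shown; SyllableCount is the Child translation and SyllableCount2 the Ipeirotis one):

    // Print both syllable counts for each test word
    [| "abalone"; "gracious"; "atheism"; "unaware"; "seaside"; "underwater"; "wonderwoman"; "biology" |]
    |> Array.iter (fun w ->
        printfn "%-12s Child: %d  Ipeirotis: %d" w (Readability.SyllableCount w) (Readability.SyllableCount2 w))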

To handle sentence counting, I used the NuGet package for the Stanford Parser and created this wrapper:

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    using edu.stanford.nlp.process;
    using edu.stanford.nlp.util;

    namespace StanfordWrapper
    {
        public class SentenceTokenizer
        {
            public static readonly TokenizerFactory TokenizerFactory =
                PTBTokenizer.factory(new CoreLabelTokenFactory(),
                    "normalizeParentheses=false,normalizeOtherBrackets=false,invertible=true");

            public static List<string> Go(string input)
            {
                java.io.Reader reader = new java.io.StringReader(input);
                DocumentPreprocessor dp = new DocumentPreprocessor(reader);
                dp.setTokenizerFactory(TokenizerFactory);

                List<string> output = new List<string>();
                foreach (java.util.List sentence in dp)
                {
                    output.Add(StringUtils.joinWithOriginalWhiteSpace(sentence));
                }
                return output;
            }
        }
    }

The wrapper is useful because the parser is written in Java; the NuGet package uses IKVMC to make it callable from .NET.
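
For example, from F#, the wrapper should cope with abbreviations that a naive split on periods would get wrong (my own illustrative check, not from the original code):

    // A naive split on '.' would report four "sentences" here;
    // the Stanford preprocessor should find two.
    let sentences = StanfordWrapper.SentenceTokenizer.Go("Dr. Smith went to Washington. He arrived at 3 p.m. yesterday.")
    printfn "%d sentences" sentences.Count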

Finally, for word counting, I use some code to clean/tokenize:

    module TextNormalizer

    open System
    open System.Collections.Generic
    open System.Linq
    open System.Text.RegularExpressions

    let spaceRegex = new Regex(@"\s+")
    let normalizeTextRegexStrict =
        new Regex( String.Join("|", [| @"[^\w\s]"; @"[0-9]+"; "_" |]), RegexOptions.Compiled)
    let normalizeTextRegexApostrophe =
        new Regex( String.Join("|", [| @"[^'\w\s]"; @"[0-9]+"; "_" |]), RegexOptions.Compiled)

    /// Replaces all punctuation with whitespace, apostrophe optional. Returns a string matching the
    /// original text with punctuation removed, text lowercased, and words evenly delimited with whitespace
    let Normalize( normedLine ) ( removeApostrophe ) =
        let normedLine =
            if removeApostrophe then
                normalizeTextRegexStrict.Replace(normedLine, " ")      // replace all punctuation with whitespace
            else
                normalizeTextRegexApostrophe.Replace(normedLine, " ")  // replace all except apostrophe with whitespace
        //return
        spaceRegex.Replace( normedLine, " " ) // reduce contiguous whitespace to a single space
            .Trim()                           // get rid of any whitespace on ends
            .ToLower()                        // lowercase whole thing
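
A quick, hypothetical example of what Normalize does with the apostrophe flag off (my example, not the author's):

    // Punctuation and digits become spaces, contiguous whitespace collapses,
    // the result is trimmed and lowercased; apostrophes survive when the flag is false.
    let cleaned = TextNormalizer.Normalize "Don't count 3 numbers -- or punctuation!" false
    // cleaned is roughly "don't count numbers or punctuation"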

With all of these pieces, computing FK is trivial:

    let FleshKincaidGradeLevel( text ) =
        let sentences =
            StanfordWrapper.SentenceTokenizer.Go( text )
            |> Seq.toArray
        let words =
            sentences
            |> Array.map( fun x -> TextNormalizer.Normalize x false )
            |> Array.collect( fun x -> x.Split( ' ' ))
        let syllableCount =
            words
            |> Array.map SyllableCount2
            |> Array.sum
        //FKGL formula: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
        ( 0.39 * ( float words.Length) / (float sentences.Length ) ) + ( 11.8 * (float syllableCount ) / ( float words.Length) ) - 15.59
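
As a quick sanity check (my example, not from the original answer), running it on the single-sentence example from the Wikipedia page should land somewhere around grade 11 to 13, depending on how the syllable counter treats words like "Australian" and "reptilian": with 13 words and 1 sentence, the word/sentence terms contribute 0.39 * 13 - 15.59, and each syllable adds 11.8 / 13.

    let grade =
        FleshKincaidGradeLevel "The Australian platypus is seemingly a hybrid of a mammal and reptilian creature."
    // 24 counted syllables gives about 11.3; 26 gives about 13.1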

I'm surprised there isn't a library for this, but do you really need one?

If you can get hold of your raw text, the calculation is fairly simple.

Check out the source of this (PHP) for counting syllables. It works just like counting sentences with a regular expression: instead of splitting on .!? you split on all of the vowels aeiouy.
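
A minimal sketch of that naive approach (my own illustration, not from the linked code; no problem-word handling, so expect rough numbers):

    open System.Text.RegularExpressions

    // Rough-and-ready FKGL: sentences = runs of .!?, words = letter runs,
    // syllables = vowel groups with a minimum of one per word.
    let naiveGradeLevel (text: string) =
        let sentences =
            Regex.Split(text, @"[.!?]+")
            |> Array.filter (fun s -> s.Trim().Length > 0)
        let words =
            Regex.Matches(text.ToLower(), @"[a-z']+")
            |> Seq.cast<Match>
            |> Seq.map (fun m -> m.Value)
            |> Seq.toArray
        let syllables (w: string) = max 1 (Regex.Matches(w, "[aeiouy]+").Count)
        let syllableCount = words |> Array.sumBy syllables
        (0.39 * float words.Length / float sentences.Length) + (11.8 * float syllableCount / float words.Length) - 15.59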

There is an open-source solution in Java. It's not .NET, but the code is relatively clear and you could probably translate it: https://github.com/ipeirotis/ReadabilityMetrics (in Java), which in turn is based on http://search.cpan.org/author/GREGFAST/Lingua-EN-Syllable-0.251/ (in Perl).