Discovering identical content or similar content is done using an algorithm called Finding Document Distance. Using this algorithm, applications for finding copyright infringements, detecting duplicate content and similar content can be effectively created.
You can find explanations of this algorithm by searching on Google so this post will not go into the details of the algorithm.
Given below is a C# implementation with sample text files. You are free to use or adapt the code for your own needs . Any pointers, queries and constructive criticism is welcome.
// Find Document Distance: Given two documents, how similar are they // // Copyright (C) 2015 Amit Sengupta, amit@truelogic.org // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see <http://www.gnu.org/licenses/>. // .......................................................................... // // // Reference: http://courses.csail.mit.edu/6.006/spring11/lectures/lec01.pdf // Algorithm: // 1.Read each document // 2.Split each document into words. A valid word is at least 3 characters long. // 3.Count word frequencies (document vectors) for each document // 4.Compute dot product for doc 1 and 2 // 5.Compute dot product for doc 1 and 1 // 6.Compute dot product for doc 2 and 2 // 7.Compute document distance . 0=identical, 1.57= completely different // using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; using System.IO; using System.Text.RegularExpressions; using System.Reflection; using System.Collections; namespace DiffDocument { class Program { static void Main(string[] args) { string appPath = Assembly.GetExecutingAssembly().Location; appPath = appPath.Substring(0, appPath.LastIndexOf("\\")); // file 1 and 2 Console.WriteLine("file1.txt and file2.txt are identical"); string file1 = appPath + "\\file1.txt"; string file2 = appPath + "\\file2.txt"; findDistance(file1, file2); // file 1 and 3 Console.WriteLine("file1.txt and file3.txt are same but file3 has some lines removed"); file1 = appPath + "\\file1.txt"; string file3 = appPath + "\\file3.txt"; findDistance(file1, file3); // file 1 and 4 Console.WriteLine("file1.txt and file4.txt are same but file4 has extra content added on the same topic"); file1 = appPath + "\\file1.txt"; string file4 = appPath + "\\file4.txt"; findDistance(file1, file4); // file 1 and 5 Console.WriteLine("file1.txt and file5.txt are same but file5 has content of file1 changed by an article spinner app"); file1 = appPath + "\\file1.txt"; string file5 = appPath + "\\file5.txt"; findDistance(file1, file5); // file 1 and 6 Console.WriteLine("file1.txt and file6.txt are same but file6 has content of file1 changed by a second article spinner app"); file1 = appPath + "\\file1.txt"; string file6 = appPath + "\\file6.txt"; findDistance(file1, file6); } /// <summary> /// Find document distance between two files /// </summary> /// <param name="file1">file path of first file</param> /// <param name="file2">file path of second file</param> public static void findDistance(string file1, string file2) { // handle file 1 string data1 = readData(file1); Dictionary<string, int> dictWordCount1 = processFile(data1); // handle file 2 string data2 = readData(file2); Dictionary<string, int> dictWordCount2= processFile(data2); // compute inner product for 1 and 2 int innerProductForBoth = getInnerProduct(dictWordCount1, dictWordCount2); Console.WriteLine("Inner product for 1 and 2 = " + innerProductForBoth.ToString()); // compute inner product for 1 and 1 int innerProductFor1 = getInnerProduct(dictWordCount1, dictWordCount1); Console.WriteLine("Inner product for 1 and 1 = " + innerProductFor1.ToString()); // compute inner product for 2 and 2 int innerProductFor2 = getInnerProduct(dictWordCount2, dictWordCount2); Console.WriteLine("Inner product for 2 and 2 = " + innerProductFor2.ToString()); // compute document distance int numerator = innerProductForBoth; double denominator = Math.Sqrt(innerProductFor1 * innerProductFor2); double distance = Math.Acos(numerator / denominator); Console.WriteLine("Document Distance = " + distance.ToString()); double percent = 100 - (distance * 100) / 1.57; Console.WriteLine("Probability of documents being the same = " + percent.ToString("#.##") + "%\n\n\n"); } /// <summary> /// Read contents of a text file /// </summary> /// <param name="path">full file path</param> /// <returns>text - data of file</returns> public static string readData(string path) { string text = ""; try { text = File.ReadAllText(path); } catch (IOException iex) { Console.WriteLine("Error reading " + path + ". " + iex.Message); } return text; } /// <summary> /// Parse file data into words , count their frequencies and return array of wordcount /// </summary> /// <param name="data">file data</param> /// <returns>dictwordCount - dictionary containing word counts</returns> public static Dictionary<string, int> processFile(string data) { Dictionary<string, int> dictWordCount = new Dictionary<string, int>(); // first convert all non-alphabet and non-number characters into spaces. Regex regex = new Regex(@"[\W_]+"); string result = regex.Replace(data, " "); // parse words from text var words = Regex.Matches(result, @"\w{3,}", RegexOptions.IgnoreCase).OfType<Match>().Select(m => m.Groups[0].Value).ToArray(); // get word count for each word in array dictWordCount = new Dictionary<string, int>(); foreach (string word in words) { if (dictWordCount.ContainsKey(word)){ dictWordCount[word] += 1; } else{ dictWordCount.Add(word, 1); } } return dictWordCount; } /// <summary> /// Process two word count arrays and compute dot product /// </summary> /// <param name="dict1">dictionary of word counts</param> /// <param name="dict2">dictionary of word counts</param> /// <returns>finalTotal - dot product</returns> public static int getInnerProduct(Dictionary<string, int> dict1, Dictionary<string, int> dict2) { int count1 = 0, count2 = 0, total, finalTotal = 0; foreach (KeyValuePair<string, int> pair in dict1) { string word = pair.Key; count1 = pair.Value; if (dict2.ContainsKey(word)) { count2 = dict2[word]; total = count1 * count2; finalTotal += total; } } return finalTotal; } } }
The output of the code is shown below:
Leave a Reply