It was a quiet day on Friday and I had a monster 90+ design document to review. Dry does not even begin to describe the document, so I decided to leave it for Monday and do something a little bit more intellectually stimulating. I went through my list of
to visit bookmarks and I stumbled upon the
Free Online courses from MIT, so after going through what was available, I settled on a CS course on algorithms,
Introduction to Algorithms.
I've created a small console app that can compare two files, all within a directory and files within two directories. I first implemented the definition of word used by the course:
A word is a consecutive sequence of alphanumeric characters, such as "Hamlet" or "2007". We'll treat all upper-case letters as if they are lower-case, so that "Hamlet" and "hamlet" are the same word. Words end at a non-alphanumeric character, so "can't" contains two words: "can" and "t".
However, I was not too satisfied with this definition, so I defined a word as:
A word is a sequence of alphanumeric characters that may contain an apostrophe, such as "Hamlet" or "can't" surrounded by a space or a punctuation mark. We'll treat all upper-case letters as if they are lower-case, so that "Hamlet" and "hamlet" are the same word.
Source Code:
1 using System;
2 using System.Collections.Generic;
3 using System.Linq;
4 using System.Text;
5 using System.IO;
6
7 namespace DocumentDistance
8 {
9 class Program
10 {
11 static void Main(string[] args)
12 {
13 switch (args.Length)
14 {
15 case 3:
16 if (args[0].ToLower().Equals("-l") || args[0].ToLower().Equals("-c"))
17 {
18 Process(args);
19 }
20 else
21 {
22 Usage();
23 }
24 break;
25 case 4:
26 if (args[1].ToLower().Equals("-dd") && (args[0].ToLower().Equals("-l") || args[0].ToLower().Equals("-c")))
27 {
28 Process(args);
29 }
30 else
31 {
32 Usage();
33 }
34 break;
35 default: Usage();
36 break;
37 }
38
39 }
40
41 <summary>
42 to invoke all processing methods
43 </summary>
44 <param name="args"></param>
45 private static void Process(string[] args)
46 {
47 try
48 {
49
50 List<string>[] words;
51
52 DirectoryInfo di;
53 FileInfo[] files, filesdir1, filesdir2;
54
55 if (args[1].ToLower().Equals("-d"))
56 {
57
58 di = new DirectoryInfo(args[2]);
59 files = di.GetFiles("*.txt");
60 }
61 else if (args[1].ToLower().Equals("-dd"))
62 {
63 di = new DirectoryInfo(args[2]);
64 filesdir1 = di.GetFiles("*.txt");
65 di = new DirectoryInfo(args[3]);
66 filesdir2 = di.GetFiles("*.txt");
67
68 files = new FileInfo[filesdir1.Length + filesdir2.Length - 1];
69
70 for (int i = 0; i < filesdir1.Length; i++)
71 {
72 files[i] = filesdir1[i];
73 }
74
75 for (int i = 0, j = filesdir1.Length - 1; i < filesdir2.Length; j++, i++)
76 {
77 files[j] = filesdir2[i];
78 }
79 }
80 else
81 {
82 files = new FileInfo[2];
83 files[0] = new FileInfo(args[1]);
84 files[1] = new FileInfo(args[2]);
85 }
86
87
88 if (args[0].ToLower().Equals("-l"))
89 {
90
91 words = ReadFilesLine(files);
92 }
93 else
94 {
95
96 words = ReadFilesChar(files);
97 }
98
99 Dictionary<string, int>[] dicts = new Dictionary<string, int>[words.Length];
100
101
102 for (int i = 0; i < words.Length; i++)
103 {
104 dicts[i] = CountWords(words[i], files[i].Name);
105 }
106
107
108 for (int i = 0; i < words.Length - 1; i++)
109 {
110 for (int j = i + 1; j < words.Length; j++)
111 {
112 CalculateDistance(files[i].Name, files[j].Name, dicts[i], dicts[j]);
113 }
114 }
115
116 Console.ReadLine();
117 }
118 catch (Exception ex)
119 {
120 Console.WriteLine("Exception: {0}", ex);
121 Console.ReadLine();
122 }
123 }
124
125 <summary>
126
127 </summary>
128 private static void Usage()
129 {
130 Console.WriteLine("Usage is DD.exe <ProcessSwitch> txtfile1 txtfile2 or DD.exe <ProcessSwitch> -d Directory Path");
131 Console.WriteLine("or DD.exe <ProcessSwitch> -dd DirectoryPath1 DirectoryPath2");
132 Console.WriteLine("where <ProcessSwitch> is either -c or -l");
133 Console.WriteLine("-c : words are defined as any alphanumeric string so that 'can't' is two words (can and t) but cant is one");
134 Console.Write("-l : words are defined as any alphanumeric string with a space at either end, apart from words ");
135 Console.Write("ending in an apostrophe e.g. computers' to indicate possesion");
136 Console.ReadLine();
137 }
138
139 <summary>
140
141 </summary>
142 <param name="file1"></param>
143 <param name="file2"></param>
144 <param name="dict1"></param>
145 <param name="dict2"></param>
146 private static void CalculateDistance(string file1, string file2, Dictionary<string, int> dict1, Dictionary<string, int> dict2)
147 {
148 long numerator = InnerProduct(dict1, dict2);
149 double denominator = Math.Sqrt(InnerProduct(dict1, dict1) * InnerProduct(dict2, dict2));
150
151
152 double theta = Math.Acos(numerator / denominator);
153
154
155 double percentage = ((theta / Math.PI * 2) - 1) * -100;
156
157 Console.WriteLine("The distance between {0} and {1} is:{2:F6} or {3:F3}% similarity.", file1, file2, theta, percentage);
158 }
159
160 <summary>
161
162
163 </summary>
164 <param name="filename"></param>
165 <returns></returns>
166 private static List<string> ReadFile(string filename)
167 {
168 try
169 {
170 StreamReader file = new StreamReader(filename);
171
172 string word = string.Empty;
173
174 List<string> words = new List<string>();
175
176 char n;
177
178 do
179 {
180 n = (char)file.Read();
181
182 if (char.IsLetterOrDigit(n))
183 {
184 word += n;
185 }
186 else
187 {
188 if (!string.IsNullOrEmpty(word))
189 {
190 words.Add(word.ToLower());
191 }
192
193 word = null;
194 }
195
196 } while (n != (char)65535);
197
198 return words;
199 }
200 catch (Exception ex)
201 {
202
203 throw new Exception("Exception While Reading file", ex.InnerException);
204 }
205 }
206
207 <summary>
208
209
210
211
212
213 </summary>
214 <param name="files"></param>
215 <returns></returns>
216 private static List<string>[] ReadFilesLine(FileInfo[] files)
217 {
218 List<string>[] words;
219 FileInfo fi;
220 string word = string.Empty;
221 string[] tempwords;
222
223 try
224 {
225 words = new List<string>[files.Length];
226
227 for (int i = 0; i < files.Length; i++)
228 {
229
230 words[i] = new List<string>();
231
232 fi = files[i];
233
234 StreamReader file = new StreamReader(fi.FullName);
235
236 string line = file.ReadLine();
237
238 do
239 {
240 word = string.Empty;
241
242
243 tempwords = line.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
244
245 foreach (string s in tempwords)
246 {
247 word = CheckAlphaNumeric(s);
248
249 if (!word.Equals(string.Empty))
250 {
251 words[i].Add(word);
252 }
253 }
254
255 line = file.ReadLine();
256
257 } while (line != null);
258 }
259
260 return words;
261 }
262 catch (Exception ex)
263 {
264 throw new Exception("Exception While Reading file", ex.InnerException);
265 }
266 }
267
268 <summary>
269
270 </summary>
271 <param name="files"></param>
272 <returns></returns>
273 private static List<string>[] ReadFilesChar(FileInfo[] files)
274 {
275 List<string>[] words = new List<string>[files.Length];
276
277 try
278 {
279 for (int i = 0; i < files.Length; i++)
280 {
281 words[i] = new List<string>();
282 words[i] = ReadFile(files[i].FullName);
283 }
284
285 return words;
286 }
287 catch (Exception ex)
288 {
289 throw new Exception("Exception While Reading file", ex.InnerException);
290 }
291 }
292
293 <summary>
294
295
296
297
298 </summary>
299 <param name="s"></param>
300 <returns></returns>
301 private static string CheckAlphaNumeric(string s)
302 {
303 int start = 0, end = 0;
304
305
306
307 for (int j = 0; j < s.Length; j++)
308 {
309 if (char.IsLetterOrDigit(s[j]))
310 {
311 start = j;
312 break;
313 }
314 }
315
316 for (int j = s.Length - 1; j > 0; j--)
317 {
318
319 if (char.IsLetterOrDigit(s[j]) || s[j] == (char)39)
320 {
321 end = j;
322 break;
323 }
324 }
325
326 if (end != s.Length - 1 || start != 0)
327 {
328
329
330 if (start == end)
331 {
332 return string.Empty;
333 }
334 else
335 {
336 return s.Substring(start, (end - start) + 1).ToLower();
337 }
338 }
339 else
340 {
341 return s.ToLower();
342 }
343 }
344
345 <summary>
346
347
348
349
350 </summary>
351 <param name="dict1"></param>
352 <param name="dict2"></param>
353 <returns></returns>
354 private static long InnerProduct(Dictionary<string, int> dict1, Dictionary<string, int> dict2)
355 {
356 long sum = 0;
357
358 foreach (var pair in dict1)
359 {
360 if (dict2.ContainsKey(pair.Key))
361 {
362 sum += dict1[pair.Key] * dict2[pair.Key];
363 }
364 }
365
366 return sum;
367 }
368
369 <summary>
370
371 <string,int>
372 </summary>
373 <param name="words"></param>
374 <param name="filename"></param>
375 <returns></returns>
376 private static Dictionary<string, int> CountWords(List<string> words, string filename)
377 {
378 Dictionary<string, int> dict = new Dictionary<string, int>();
379
380 foreach (string s in words)
381 {
382 if (!dict.ContainsKey(s.ToLower()))
383 {
384 dict.Add(s.ToLower(), 1);
385 }
386 else
387 {
388 dict[s.ToLower()] = dict[s.ToLower()] + 1;
389 }
390 }
391
392 Console.WriteLine("{0}: Total Words {1}, Distinct Words {2}", filename.Remove(filename.LastIndexOf('.')), words.Count, dict.Count);
393
394 return dict;
395 }
396
397 }
398 }
Let's see whether there are significant differences between both word definitions:
dd -c t1.verne.txt t2.bobsey.txt
t1.verne: Total Words 8943, Distinct Words 2150
t2.bobsey: Total Words 49785, Distinct Words 3354
The distance between t1.verne.txt and t2.bobsey.txt is:0.582949 or 62.888% similarity.
dd -l t1.verne.txt t2.bobsey.txt
t1.verne: Total Words 8629, Distinct Words 2189
t2.bobsey: Total Words 47554, Distinct Words 3655
The distance between t1.verne.txt and t2.bobsey.txt is:0.538879 or 65.694% similarity.
There is a bit of a change but not massive. An another comparison:
dd -c t2.bobsey.txt t3.lewis.txt
t2.bobsey: Total Words 49785, Distinct Words 3354
t3.lewis: Total Words 182355, Distinct Words 8530
The distance between t2.bobsey.txt and t3.lewis.txt is:0.574160 or 63.448% similarity.
dd -l t2.bobsey.txt t3.lewis.txt
t2.bobsey: Total Words 47554, Distinct Words 3655
t3.lewis: Total Words 180645, Distinct Words 9022
The distance between t2.bobsey.txt and t3.lewis.txt is:0.521558 or 66.797% similarity.
Again the same sort of variation between the word definitions, from now on I'll stick to my definition of a word. The other crucial point is that the similarity is quite high between the texts, ~65%. How about if we test two novels from the same author, say
Jane Austen?
Emma: Total Words 160028, Distinct Words 10450
Pride and Prejudice: Total Words 124224, Distinct Words 7181
The distance between Emma.txt and Pride and Prejudice.txt is:0.183119 or 88.342% similarity.
In fact the lowest similarity between a selected few of Jane Austen's novels is ~ 85% and between a few of Arthur Conan Doyle's novels is ~ 87%. So it would be reasonable to suppose an ~85% similarity between novels by the same author or would it?
It turns out that it's not quite as simple as that, see below a comparison between several of Dickens' novel:
A Tale of Two Cities: Total Words 138321, Distinct Words 11202
David Coperfield: Total Words 356158, Distinct Words 20083
Great Expectations: Total Words 186598, Distinct Words 12711
Hard Times: Total Words 105454, Distinct Words 10543
Oliver Twist: Total Words 160480, Distinct Words 13350
The Life And Adventures Of Nicholas Nickleby: Total Words 324043, Distinct Words 20720
The Pickwick Papers: Total Words 300922, Distinct Words 21384
The distance between A Tale of Two Cities.txt and David Coperfield.txt is:0.378243 or 75.920% similarity.
The distance between A Tale of Two Cities.txt and Great Expectations.txt is:0.314171 or 79.999% similarity.
The distance between A Tale of Two Cities.txt and Hard Times.txt is:0.256404 or 83.677% similarity.
The distance between A Tale of Two Cities.txt and Oliver Twist.txt is:0.169488 or 89.210% similarity.
The distance between A Tale of Two Cities.txt and The Life And Adventures Of Nicholas Nickleby.txt is:0.186920 or 88.100% similarity.
The distance between A Tale of Two Cities.txt and The Pickwick Papers.txt is:0.257100 or 83.633% similarity.
The distance between David Coperfield.txt and Great Expectations.txt is:0.153898 or 90.203% similarity.
The distance between David Coperfield.txt and Hard Times.txt is:0.268132 or 82.930% similarity.
The distance between David Coperfield.txt and Oliver Twist.txt is:0.436028 or 72.242% similarity.
The distance between David Coperfield.txt and The Life And Adventures Of Nicholas Nickleby.txt is:0.342514 or 78.195% similarity.
The distance between David Coperfield.txt and The Pickwick Papers.txt is:0.466544 or 70.299% similarity.
The distance between Great Expectations.txt and Hard Times.txt is:0.252974 or 83.895% similarity.
The distance between Great Expectations.txt and Oliver Twist.txt is:0.375174 or 76.116% similarity.
The distance between Great Expectations.txt and The Life And Adventures Of Nicholas Nickleby.txt is:0.296311 or 81.136% similarity.
The distance between Great Expectations.txt and The Pickwick Papers.txt is:0.425808 or 72.892% similarity.
The distance between Hard Times.txt and Oliver Twist.txt is:0.306853 or 80.465% similarity.
The distance between Hard Times.txt and The Life And Adventures Of Nicholas Nickleby.txt is:0.218641 or 86.081% similarity.
The distance between Hard Times.txt and The Pickwick Papers.txt is:0.364912 or 76.769% similarity.
The distance between Oliver Twist.txt and The Life And Adventures Of Nicholas Nickleby.txt is:0.203269 or 87.059% similarity.
The distance between Oliver Twist.txt and The Pickwick Papers.txt is:0.201226 or 87.190% similarity.
The distance between The Life And Adventures Of Nicholas Nickleby.txt and The Pickwick Papers.txt is:0.272799 or 82.633% similarity.
There are three comparisons that yield less than 73% and another three that yield ~76%, so perhaps we need to recalibrate how similar two documents need to be before we can claim that they have been written by the same author. Three more comparisons:
Emma: Total Words 160028, Distinct Words 10450
David Coperfield: Total Words 356158, Distinct Words 20083
The distance between Emma.txt and David Coperfield.txt is:0.350631 or 77.678% similarity.
Emma: Total Words 160028, Distinct Words 10450
Hard Times: Total Words 105454, Distinct Words 10543
The distance between Emma.txt and Hard Times.txt is:0.295020 or 81.218% similarity.
Great Expectations: Total Words 186598, Distinct Words 12711
A Study in Scarlet: Total Words 46660, Distinct Words 6418
The distance between Great Expectations.txt and A Study in Scarlet.txt is:0.297589 or 81.055% similarity.
Oh Dear!, they've all been written by the same literary master, but by whom?
These comparisons clearly show the limitations of this type of, very simple, word frequency analysis. You weren't really expecting to know whether Sir Francis Bacon wrote Shakespeare's plays with a 400 line application were you?
dd -l t8.shakespeare.txt t9.bacon.txt
t8.shakespeare: Total Words 898261, Distinct Words 30241
t9.bacon: Total Words 55612, Distinct Words 7943
The distance between t8.shakespeare.txt and t9.bacon.txt is:0.530085 or 66.254% similarity.