It was a quiet day on Friday and I had a monster 90+ design document to review. Dry does not even begin to describe the document, so I decided to leave it for Monday and do something a little bit more intellectually stimulating. I went through my list of 
to visit bookmarks and I stumbled upon the 
Free Online courses from MIT, so after going through what was available, I settled on a CS course on algorithms, 
Introduction to Algorithms. 
I've created a small console app that can compare two files, all within a directory and files within two directories. I first implemented the definition of word used by the course:
A word is a consecutive sequence of alphanumeric characters,  such as "Hamlet" or "2007".  We'll treat all upper-case letters as if  they are lower-case, so that "Hamlet" and "hamlet" are the same word.   Words end at a non-alphanumeric character, so "can't" contains two words: "can" and "t". 
However, I was not too satisfied with this definition, so I defined a word as:
A word is a sequence of alphanumeric characters that may contain an apostrophe,  such as "Hamlet" or "can't" surrounded by a space or a punctuation mark. We'll treat all upper-case letters as if  they are lower-case, so that "Hamlet" and "hamlet" are the same word. 
Source Code:
   1 using System;
   2 using System.Collections.Generic;
   3 using System.Linq;
   4 using System.Text;
   5 using System.IO;
   6 
   7 namespace DocumentDistance
   8 {
   9     class Program
  10     {
  11         static void Main(string[] args)
  12         {
  13              switch (args.Length)
  14              {
  15                  case 3:
  16                      if (args[0].ToLower().Equals("-l") || args[0].ToLower().Equals("-c"))
  17                      {
  18                          Process(args);
  19                      }
  20                      else
  21                      {
  22                          Usage();
  23                      }
  24                      break;
  25                  case 4:
  26                      if (args[1].ToLower().Equals("-dd") && (args[0].ToLower().Equals("-l") || args[0].ToLower().Equals("-c")))
  27                      {
  28                          Process(args);
  29                      }
  30                      else
  31                      {
  32                          Usage();
  33                      }
  34                      break;
  35                  default: Usage();
  36                      break;
  37              }
  38        
  39         }
  40 
  41         <summary>
  42         to invoke all processing methods
  43         </summary>
  44         <param name="args"></param>
  45         private static void Process(string[] args)
  46         {
  47             try
  48             {
  49                 
  50                 List<string>[] words;
  51 
  52                 DirectoryInfo di;
  53                 FileInfo[] files, filesdir1, filesdir2;
  54 
  55                 if (args[1].ToLower().Equals("-d"))
  56                 {
  57                     
  58                     di = new DirectoryInfo(args[2]);
  59                     files = di.GetFiles("*.txt");
  60                 }
  61                 else if (args[1].ToLower().Equals("-dd"))
  62                 {
  63                     di = new DirectoryInfo(args[2]);
  64                     filesdir1 = di.GetFiles("*.txt");
  65                     di = new DirectoryInfo(args[3]);
  66                     filesdir2 = di.GetFiles("*.txt");
  67 
  68                     files = new FileInfo[filesdir1.Length + filesdir2.Length - 1];
  69 
  70                     for (int i = 0; i < filesdir1.Length; i++)
  71                     {
  72                         files[i] = filesdir1[i];
  73                     }
  74 
  75                     for (int i = 0, j = filesdir1.Length - 1; i < filesdir2.Length; j++, i++)
  76                     {
  77                         files[j] = filesdir2[i];
  78                     }
  79                 }
  80                 else
  81                 {
  82                     files = new FileInfo[2];
  83                     files[0] = new FileInfo(args[1]);
  84                     files[1] = new FileInfo(args[2]);
  85                 }
  86 
  87 
  88                 if (args[0].ToLower().Equals("-l"))
  89                 {
  90                     
  91                     words = ReadFilesLine(files);
  92                 }
  93                 else
  94                 {
  95                     
  96                     words = ReadFilesChar(files);
  97                 }
  98                 
  99                 Dictionary<string, int>[] dicts = new Dictionary<string, int>[words.Length];
 100 
 101                 
 102                 for (int i = 0; i < words.Length; i++)
 103                 {
 104                     dicts[i] = CountWords(words[i], files[i].Name);
 105                 }
 106 
 107                 
 108                 for (int i = 0; i < words.Length - 1; i++)
 109                 {
 110                     for (int j = i + 1; j < words.Length; j++)
 111                     {
 112                         CalculateDistance(files[i].Name, files[j].Name, dicts[i], dicts[j]);
 113                     }
 114                 }
 115 
 116                 Console.ReadLine();
 117             }
 118             catch (Exception ex)
 119             {
 120                 Console.WriteLine("Exception: {0}", ex);
 121                 Console.ReadLine();
 122             }
 123         }
 124 
 125         <summary>
 126         
 127         </summary>
 128         private static void Usage()
 129         {
 130             Console.WriteLine("Usage is DD.exe <ProcessSwitch> txtfile1 txtfile2 or DD.exe <ProcessSwitch> -d Directory Path");
 131             Console.WriteLine("or DD.exe <ProcessSwitch> -dd DirectoryPath1 DirectoryPath2");
 132             Console.WriteLine("where <ProcessSwitch> is either -c or -l");
 133             Console.WriteLine("-c : words are defined as any alphanumeric string so that 'can't' is two words (can and t) but cant is one");
 134             Console.Write("-l : words are defined as any alphanumeric string with a space at either end, apart from words ");
 135             Console.Write("ending in an apostrophe e.g. computers' to indicate possesion");
 136             Console.ReadLine();
 137         }
 138 
 139         <summary>
 140         
 141         </summary>
 142         <param name="file1"></param>
 143         <param name="file2"></param>
 144         <param name="dict1"></param>
 145         <param name="dict2"></param>
 146         private static void CalculateDistance(string file1, string file2, Dictionary<string, int> dict1, Dictionary<string, int> dict2)
 147         {
 148             long numerator = InnerProduct(dict1, dict2);
 149             double denominator = Math.Sqrt(InnerProduct(dict1, dict1) * InnerProduct(dict2, dict2));
 150 
 151             
 152             double theta = Math.Acos(numerator / denominator);
 153 
 154             
 155             double percentage = ((theta / Math.PI * 2) - 1) * -100;
 156 
 157             Console.WriteLine("The distance between {0} and {1} is:{2:F6} or {3:F3}% similarity.", file1, file2, theta, percentage);
 158         }
 159 
 160         <summary>
 161         
 162         
 163         </summary>
 164         <param name="filename"></param>
 165         <returns></returns>
 166         private static List<string> ReadFile(string filename)
 167         {
 168             try
 169             {
 170                 StreamReader file = new StreamReader(filename);
 171 
 172                 string word = string.Empty;
 173 
 174                 List<string> words = new List<string>();
 175 
 176                 char n;
 177 
 178                 do
 179                 {
 180                     n = (char)file.Read();
 181 
 182                     if (char.IsLetterOrDigit(n))
 183                     {
 184                         word += n;
 185                     }
 186                     else
 187                     {
 188                         if (!string.IsNullOrEmpty(word))
 189                         {
 190                             words.Add(word.ToLower());
 191                         }
 192 
 193                         word = null;
 194                     }
 195 
 196                 } while (n != (char)65535);
 197 
 198                 return words;
 199             }
 200             catch (Exception ex)
 201             {
 202 
 203                 throw new Exception("Exception While Reading file", ex.InnerException);
 204             }
 205         }
 206 
 207         <summary>
 208         
 209         
 210         
 211         
 212         
 213         </summary>
 214         <param name="files"></param>
 215         <returns></returns>
 216         private static List<string>[] ReadFilesLine(FileInfo[] files)
 217         {
 218             List<string>[] words;
 219             FileInfo fi;
 220             string word = string.Empty;
 221             string[] tempwords;
 222 
 223             try
 224             {
 225                 words = new List<string>[files.Length];
 226 
 227                 for (int i = 0; i < files.Length; i++)
 228                 {
 229                     
 230                     words[i] = new List<string>();
 231 
 232                     fi = files[i];
 233 
 234                     StreamReader file = new StreamReader(fi.FullName);
 235 
 236                     string line = file.ReadLine();
 237 
 238                     do
 239                     {
 240                         word = string.Empty;
 241 
 242                         
 243                         tempwords = line.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
 244 
 245                         foreach (string s in tempwords)
 246                         {
 247                             word = CheckAlphaNumeric(s);
 248 
 249                             if (!word.Equals(string.Empty))
 250                             {
 251                                 words[i].Add(word);
 252                             }
 253                         }
 254 
 255                         line = file.ReadLine();
 256 
 257                     } while (line != null);
 258                 }
 259 
 260                 return words;
 261             }
 262             catch (Exception ex)
 263             {
 264                 throw new Exception("Exception While Reading file", ex.InnerException);
 265             }
 266         }
 267 
 268         <summary>
 269         
 270         </summary>
 271         <param name="files"></param>
 272         <returns></returns>
 273         private static List<string>[] ReadFilesChar(FileInfo[] files)
 274         {
 275             List<string>[] words = new List<string>[files.Length];
 276 
 277             try
 278             {
 279                 for (int i = 0; i < files.Length; i++)
 280                 {
 281                     words[i] = new List<string>();
 282                     words[i] = ReadFile(files[i].FullName);
 283                 }
 284 
 285                 return words;
 286             }
 287             catch (Exception ex)
 288             {
 289                 throw new Exception("Exception While Reading file", ex.InnerException);
 290             }
 291         }
 292 
 293         <summary>
 294         
 295         
 296         
 297         
 298         </summary>
 299         <param name="s"></param>
 300         <returns></returns>
 301         private static string CheckAlphaNumeric(string s)
 302         {
 303             int start = 0, end = 0;
 304 
 305             
 306             
 307             for (int j = 0; j < s.Length; j++)
 308             {
 309                 if (char.IsLetterOrDigit(s[j]))
 310                 {
 311                     start = j;
 312                     break;
 313                 }
 314             }
 315 
 316             for (int j = s.Length - 1; j > 0; j--)
 317             {
 318                 
 319                 if (char.IsLetterOrDigit(s[j]) || s[j] == (char)39)
 320                 {
 321                     end = j;
 322                     break;
 323                 }
 324             }
 325 
 326             if (end != s.Length - 1 || start != 0)
 327             {
 328                 
 329                 
 330                 if (start == end)
 331                 {
 332                     return string.Empty;
 333                 }
 334                 else
 335                 {
 336                     return s.Substring(start, (end - start) + 1).ToLower();
 337                 }
 338             }
 339             else
 340             {
 341                 return s.ToLower();
 342             }
 343         }
 344 
 345         <summary>
 346         
 347         
 348         
 349         
 350         </summary>
 351         <param name="dict1"></param>
 352         <param name="dict2"></param>
 353         <returns></returns>
 354         private static long InnerProduct(Dictionary<string, int> dict1, Dictionary<string, int> dict2)
 355         {
 356             long sum = 0;
 357             
 358             foreach (var pair in dict1)
 359             {
 360                 if (dict2.ContainsKey(pair.Key))
 361                 {
 362                     sum += dict1[pair.Key] * dict2[pair.Key];
 363                 }
 364             }
 365 
 366             return sum;
 367         }
 368 
 369         <summary>
 370         
 371         <string,int>
 372         </summary>
 373         <param name="words"></param>
 374         <param name="filename"></param>
 375         <returns></returns>
 376         private static Dictionary<string, int> CountWords(List<string> words, string filename)
 377         {
 378             Dictionary<string, int> dict = new Dictionary<string, int>();
 379 
 380             foreach (string s in words)
 381             {
 382                 if (!dict.ContainsKey(s.ToLower()))
 383                 {
 384                     dict.Add(s.ToLower(), 1);
 385                 }
 386                 else
 387                 {
 388                     dict[s.ToLower()] = dict[s.ToLower()] + 1;
 389                 }
 390             }
 391 
 392             Console.WriteLine("{0}: Total Words {1}, Distinct Words {2}", filename.Remove(filename.LastIndexOf('.')), words.Count, dict.Count);
 393 
 394             return dict;
 395         }
 396 
 397     }
 398 }
Let's see whether there are significant differences between both word definitions:
dd -c t1.verne.txt t2.bobsey.txt
t1.verne: Total Words 8943, Distinct Words 2150
t2.bobsey: Total Words 49785, Distinct Words 3354
The distance between t1.verne.txt and t2.bobsey.txt is:0.582949 or 62.888% similarity.
dd -l t1.verne.txt t2.bobsey.txt
t1.verne: Total Words 8629, Distinct Words 2189
t2.bobsey: Total Words 47554, Distinct Words 3655
The distance between t1.verne.txt and t2.bobsey.txt is:0.538879 or 65.694% similarity.
There is a bit of a change but not massive.  An another comparison:
dd -c t2.bobsey.txt t3.lewis.txt
t2.bobsey: Total Words 49785, Distinct Words 3354
t3.lewis: Total Words 182355, Distinct Words 8530
The distance between t2.bobsey.txt and t3.lewis.txt is:0.574160 or 63.448% similarity.
dd -l t2.bobsey.txt t3.lewis.txt 
t2.bobsey: Total Words 47554, Distinct Words 3655
t3.lewis: Total Words 180645, Distinct Words 9022
The distance between t2.bobsey.txt and t3.lewis.txt is:0.521558 or 66.797% similarity.
Again the same sort of variation between the word definitions, from now on I'll stick to my definition of a word. The other crucial point is that the similarity is quite high between the texts, ~65%. How about if we test two novels from the same author, say 
Jane Austen? 
Emma: Total Words 160028, Distinct Words 10450
Pride and Prejudice: Total Words 124224, Distinct Words 7181
The distance between Emma.txt and Pride and Prejudice.txt is:0.183119 or 88.342% similarity.
In fact the lowest similarity between a selected few of Jane Austen's novels is ~ 85% and between a few of Arthur Conan Doyle's novels is ~ 87%. So it would be reasonable to suppose an ~85% similarity between novels by the same author or would it?
It turns out that it's not quite as simple as that, see below a comparison between several of Dickens' novel:
A Tale of Two Cities: Total Words 138321, Distinct Words 11202
David Coperfield: Total Words 356158, Distinct Words 20083
Great Expectations: Total Words 186598, Distinct Words 12711
Hard Times: Total Words 105454, Distinct Words 10543
Oliver Twist: Total Words 160480, Distinct Words 13350
The Life And Adventures Of Nicholas Nickleby: Total Words 324043, Distinct Words 20720
The Pickwick Papers: Total Words 300922, Distinct Words 21384
The distance between A Tale of Two Cities.txt and David Coperfield.txt is:0.378243 or 75.920% similarity.
The distance between A Tale of Two Cities.txt and Great Expectations.txt is:0.314171 or 79.999% similarity.
The distance between A Tale of Two Cities.txt and Hard Times.txt is:0.256404 or 83.677% similarity.
The distance between A Tale of Two Cities.txt and Oliver Twist.txt is:0.169488 or 89.210% similarity.
The distance between A Tale of Two Cities.txt and The Life And Adventures Of Nicholas Nickleby.txt is:0.186920 or 88.100% similarity.
The distance between A Tale of Two Cities.txt and The Pickwick Papers.txt is:0.257100 or 83.633% similarity.
The distance between David Coperfield.txt and Great Expectations.txt is:0.153898 or 90.203% similarity.
The distance between David Coperfield.txt and Hard Times.txt is:0.268132 or 82.930% similarity.
The distance between David Coperfield.txt and Oliver Twist.txt is:0.436028 or 72.242% similarity.
The distance between David Coperfield.txt and The Life And Adventures Of Nicholas Nickleby.txt is:0.342514 or 78.195% similarity.
The distance between David Coperfield.txt and The Pickwick Papers.txt is:0.466544 or 70.299% similarity.
The distance between Great Expectations.txt and Hard Times.txt is:0.252974 or 83.895% similarity.
The distance between Great Expectations.txt and Oliver Twist.txt is:0.375174 or 76.116% similarity.
The distance between Great Expectations.txt and The Life And Adventures Of Nicholas Nickleby.txt is:0.296311 or 81.136% similarity.
The distance between Great Expectations.txt and The Pickwick Papers.txt is:0.425808 or 72.892% similarity.
The distance between Hard Times.txt and Oliver Twist.txt is:0.306853 or 80.465% similarity.
The distance between Hard Times.txt and The Life And Adventures Of Nicholas Nickleby.txt is:0.218641 or 86.081% similarity.
The distance between Hard Times.txt and The Pickwick Papers.txt is:0.364912 or 76.769% similarity.
The distance between Oliver Twist.txt and The Life And Adventures Of Nicholas Nickleby.txt is:0.203269 or 87.059% similarity.
The distance between Oliver Twist.txt and The Pickwick Papers.txt is:0.201226 or 87.190% similarity.
The distance between The Life And Adventures Of Nicholas Nickleby.txt and The Pickwick Papers.txt is:0.272799 or 82.633% similarity.
There are three comparisons that yield less than 73% and another three that yield ~76%, so perhaps we need to recalibrate how similar two documents need to be before we can claim that they have been written by the same author. Three more comparisons: 
Emma: Total Words 160028, Distinct Words 10450
David Coperfield: Total Words 356158, Distinct Words 20083
The distance between Emma.txt and David Coperfield.txt is:0.350631 or 77.678% similarity. 
Emma: Total Words 160028, Distinct Words 10450
Hard Times: Total Words 105454, Distinct Words 10543
The distance between Emma.txt and Hard Times.txt is:0.295020 or 81.218% similarity. 
Great Expectations: Total Words 186598, Distinct Words 12711
A Study in Scarlet: Total Words 46660, Distinct Words 6418
The distance between Great Expectations.txt and A Study in Scarlet.txt is:0.297589 or 81.055% similarity. 
Oh Dear!, they've all been written by the same literary master, but by whom?
These comparisons clearly show the limitations of this type of, very simple, word frequency analysis.  You weren't really expecting to know whether Sir Francis Bacon wrote Shakespeare's plays with a 400 line application were you?
dd -l t8.shakespeare.txt t9.bacon.txt
t8.shakespeare: Total Words 898261, Distinct Words 30241
t9.bacon: Total Words 55612, Distinct Words 7943
The distance between t8.shakespeare.txt and t9.bacon.txt is:0.530085 or 66.254% similarity.