# Inverted Index (Version #3)
# H. Conrad Cunningham
# 19 September 2006

# This Inverted Index program builds an inverted index data structure for
# files containing English text.  The names of the files are given
# successively in the argument to calls of the method index_file.  The words
# may appear appear zero or more per line with white space and punctuation.

# A word is any contiguous sequence of word characters, that is, alphabetic
# or numeric characters or the underscore character, separated from its
# surrounding words by white space or punctuation.  Hyphens, apostrophes,
# and periods embedded within the word are included in the word, but those
# characters at the beginning or end of a word are not included.  Other
# white space or punctuation characters are considered as items between
# words.

# The lookup method enables a client to find a word in the inverted index
# and retrieve the corresponding filename/linenumber occurrence list.

# The print_index method enables a client to print the index to the standard
# output stream.

# INTERNAL DESIGN

# The inverted index is implemented as a Ruby hash whose keys are the
# English words (as String data) and whose values are nested arrays
# referring to the filenames and the line numbers at which the key appeared
# in the files. 

# The hash has the following structure:
#
# ["word1" => [ ["file1", [1,8,10]], ["file2", [7], ...],
#  "word2" => [ ["file2", [2,2,3]], ...
#    ...
#  ]
#
# This means that "word1" can be found in "file1" on lines 1, 8, and 10 and
# in "file2" at line 7, and so forth.

# Note: This program makes extensive use of blocks and iterators.  The
# purpose for its construction was, in part, to help the author learn how to
# use these features of Ruby (that are somewhat familiar to him from his
# Haskell functional programming experience.)

# Current issues:
#     - error handling and robustness should be improved
#     - handling of stop words should be investigated further
#     - testing should be improved

class InvertedIndex

  # Regular expression to match English words
  @@wp   = /(\w+([-'.]\w+)*)/ 

  # Stop word set.  To ignore word, set it anything other than nil, false,
  # or 0.  It is better to use true.

  DEFAULT_STOPS = {"the" => true, "a" => true, "an" => true}


  # Create an empty hash as the initial value for the inverted index.  The
  # optional argument is an array of stop words to ignore in building the
  # inverted index.

  def initialize(*args)
    @files_indexed = []
    @index = Hash.new
    @stops = Hash.new
    if args.size == 1
      args[0].each {|w| @stops[w] = true}
    else
      @stops = DEFAULT_STOPS
    end
  end


  # Method index_file adds the words from file "filename" to the inverted
  # index contained in this object if not already indexed.

  def index_file(filename)

    # Exit if input file has already been indexed.

    unless @files_indexed.index(filename) == nil
      STDERR.puts(
        "In InvertedFile#index_file, file #{filename} already indexed.")
      return
    end

    # Exit if input file does not exist or is unreadable.

    unless File.exist? filename
      STDERR.puts(
        "In InvertedFile#index_file, file #{filename} does not exist.")
      return
    end
    unless File.readable? filename
      STDERR. puts(
        "In InvertedFile#index_file, file #{filename} is not readable.")
      return
    end

    # Add filename at end of the list of files indexed.

    @files_indexed.push(filename)

    # Open the input file of English text
    inf = File.new(filename)

    # Initialize the line counter
    lineno = 0

    # The File#each iterator applies the block to each line of the file.
    # Its block is executed for its side affects, not for any value
    # returned.

    inf.each do |s|  # s is a String giving a line of text from the input file 

      # Increment the line counter
      lineno += 1 

      # Isolate the words as array entries in lowercase.

      # Below, the String#scan iterator matches all occurrences of its
      # argument regular expression groups in the receiver string and
      # returns an array of arrays.  The only value of interest in an inner
      # array is the first one, which is the full word matched by the
      # regular expression.

      # Below, the Enumerable#map iterator (seen here as Array#map) applies
      # the block to each element of the receiver structure (here an array)
      # and returns the resulting array.  It does not modify the structure
      # to which it is applied, but creates a new structure.

      # Note: Enumerable#map is an alias for Enumerable#collect.

      # Below, the String#downcase method returns a copy of its receiver
      # string with all characters converted to lowercase.

      words = s.scan(@@wp).map {|a| a[0].downcase} 

      # Remove words that are stop words.

      # Below, the Enumerable#reject iterator (called here as Array#reject)
      # removes words that are in the stop word set.  The stop word set is
      # represented by hash @stops.  A word is to be ignored if it maps to
      # anything other than nil, false, or 0.

      words = words.reject {|w| @stops[w]}

      # Attach the filename and line number to each word to generate a
      # filename/linenumber pair.  The result is nested array with the
      # structure [ ["word1", ["file1", [linenumber]]], ...].  The
      # linenumbers are in arrays because the index may have previously been
      # built for previous files.
   
      words = words.map {|w| [w,[filename,[lineno]]]}

      # Insert the filename/linenumber occurrence lists into the hash with
      # the words as the keys.  The values are nested arrays with a
      # structure such as [ ["file1", [1]], ["file1", [3]], ...]

      # Below, the Array#each iterator applies the block to each element of
      # the receiver array.  It is executed for its side affects, not for
      # any values returned by the call.

      # Below, the Array#push method appends a new entry at the end of an
      # array.

      words.each do |p|  # p has structure [ word, [file,[line]] ]
        @index[p[0]] = [] unless @index.has_key? p[0] # initialize entry
        @index[p[0]] = @index[p[0]].push(p[1])
      end

    end

    # Close the input file.
    inf.close

    # For each key in the hash (i.e., each unique word we found in the
    # files), sort the filename/linenumber occurrence lists (arrays) into
    # ascending order by filename.

    # Below, the Hash#each iterator applies the block to each key-value pair
    # from the hash.  It passes these as the k and v arguments of the block,
    # respectively.  It is executed for its side affects, not for any values
    # returned by the call.

    # Below, the Enumerable#sort iterator (used here as Array#sort) sorts
    # the array using the block to make the comparison.

    @index.each do |k,v|  # k => v is one key/value entry in the hash
      @index[k] =  v.sort {|a,b| a[0] <=> b[0]}
    end

    # For each key in the hash (i.e., each unique word we found in this or
    # previous files), combine all the linenumber occurrence lists (i.e.,
    # arrays) for a filename into a single list of linenumbers.  That is, it
    # folds lists of the form 
    #     [ [file1,[line1]], [file1, [line2]], ... [file2, [line3]] ...]  
    # into the more compact form 
    #     [ [file1,[line1,line2]], [file2, [line3,...]] ...].  
    # It assumes that the filename/linenumber occurrence lists are in
    # ascending order.

    # Below, the Array#slice operator returns a contiguous segment of the
    # array denoted by its range argument.  The usage here returns a copy of
    # the array with its first element removed.

    # Below, the Enumerable#inject iterator (called as Array#inject) applies
    # the block to the previous value of the accumulator (parameter acc) and
    # each element's value (parameter v) to produce a new value of the
    # accumulator.  The initial value of the accumulator is the argument of
    # the call to inject. The block must return the new value of the
    # accumulator.

    # The block associated with the Enumerable#inject call does the work of
    # combining the list as described above.  If the current element's
    # filename is the same as the last one in the accumulator, the block
    # appends the new list of linenumbers to the last entry in the
    # accumulator.  If the filename is different from the previous one, then
    # the block appends a new filename/linenumber entry to the accumulator.
    # Note that this assumes that the list is sorted by filename.

    @index.each do |k,v|
      @index[k] = v.slice(1...v.length).inject([v[0]]) do |acc, e|
                    if acc[-1][0] == e[0]
                      acc[-1][1] = acc[-1][1] + e[1]
                    else
                      acc = acc + [e]
                    end
                    acc  # must return value of accumulator!
                  end
    end  # of @index.each's block
    self # allow chaining of calls by returning reference to this object
  end # of method index_file


  # Method lookup returns the corresponding filename/linenumber occurrence
  # list for the word if it is in the inverted index.  Otherwise, the method
  # returns nil.  The filename/linenumber occurrence has the structure 
  # [ ["file1", [2,2,3]], ["file2", [1]], ...] where the filename entries are
  # are in ascending alphabetical order by filename and the linenumber lists
  # are in ascending numerical order.

  # Note: This method hides the actual data structure used for the inverted
  # index inside the class.

  def lookup(word)

    # This returns a deep copy of the filename/linenumber occurence list
    # from the hash.  A simple clone of the hash entry would enable the
    # internal structure of the inverted index to be changed from outside
    # the class.  This copies the items three levels deep.  It clones the
    # filename String but it does not clone the Fixnum entries in the
    # linenumber list.

    if @index[word]
      @index[word].map {|f| [f[0].clone, f[1].clone] } 
    else 
      nil 
    end 
  end


  # Method print_index prints the inverted index with the words in
  # alphabetical order.  The format of the output is:
  #
  #     word1
  #         file1:  j1,j2,...,jn
  #         file2:  k1,k2,...,km
  #         ...
  #     word2
  #         ...
    
  def print_index
    ordarr = @index.sort
    ordarr.each do |w|
      puts "#{w[0]}"
      w[1].each do |f|
        l = (f[1].map {|n| n.to_s}).inject {|acc,e| "#{acc}, #{e}" }
        puts "    #{f[0]}: #{l}"
      end
    end
    puts "Stop word set:  #{@stops.keys.join(", ")}"
    self # allow chaining of calls by returning reference to this object
  end  
end

# Class to do some testing of InvertedIndex.  It needs the presence of
# text files "test1" and "test2" in the default directory.

class TestII

  def initialize
  end

  def TestII.do_test
    inv = InvertedIndex.new

    inv.index_file("test1")
    puts "\nIndex after inserting file test1"
    inv.print_index

    inv.index_file("test2")
    puts "\nIndex after inserting file test2"
    inv.print_index

    inv.index_file("TestNonexistentFile")

    x = inv.lookup("one")
    x = if !x then "???" 
        else x.map {|f| f[0] + ": " + f[1].join(", ") + "; "} 
        end
    puts "Lookup \"one\" -- #{x}"
    x = inv.lookup("test-file")
    x = if !x then "???" 
        else x.map {|f| f[0] + ": " + f[1].join(", ") + "; "} 
        end
    puts "Lookup \"test-file\" -- #{x}"
    x = inv.lookup("nothere")
    x = if !x then "???" 
        else x.map {|f| f[0] + ": " + f[1].join(", ") + "; "} 
        end
    puts "Lookup \"notthere\" -- #{x}"
    x = inv.lookup("the")
    x = if !x then "???" 
        else x.map {|f| f[0] + ": " + f[1].join(", ") + "; "} 
        end
    puts "Lookup \"the\" -- #{x}"
    puts "\nDONE!"
  end

end