#_________________ program that counts word frequencies in a text file__________________
#5.py


#NOTE: The lecture note pdf file for today will outline steps in an example to solve this problem.
# The program here does the same thing, but it will be easier to understand if you start with the
# explanation in that file, and try to write the program yourself following the steps given.

#When you have done that, it will be easier to understand the program below.


def byFreq(pair):
    """Sort key: return the element at index 1 of *pair*.

    For a (word, count) tuple this is the count, so passing this function
    as ``key=`` to a sort orders the tuples by frequency.
    """
    frequency = pair[1]
    return frequency
 
 
def _count_words(text):
    """Return a dict mapping each lower-cased word in *text* to its count.

    Every character listed in ``symbols`` is replaced by a space before the
    text is split on whitespace, so "dog." and "dog" count as the same word.
    The apostrophe is not in the list, so words such as "don't" stay intact.
    """
    # Raw string: the original literal contained the invalid escape "\^",
    # which newer Python versions report as a SyntaxWarning. The set of
    # characters is unchanged (it still includes the backslash).
    symbols = r'!"#$%&()*+,-./:;<=>?@[\^_]`{|}~'

    # One translate() pass replaces every symbol with a space, instead of
    # one str.replace() call per symbol.
    to_space = str.maketrans(symbols, " " * len(symbols))
    words = text.lower().translate(to_space).split()

    counts = {}                          # word is key, count is value
    for w in words:
        # get() returns 0 for a new word and the running count for a
        # previously seen word.
        counts[w] = counts.get(w, 0) + 1
    return counts


def _most_frequent(counts, n):
    """Return up to *n* (word, count) pairs, highest count first.

    Words with equal counts are listed alphabetically. If *n* exceeds the
    number of distinct words, all pairs are returned; if *n* is zero or
    negative, the result is empty.
    """
    # Negating the count sorts high-to-low while the word breaks ties in
    # ascending order, so a single sort is enough.
    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    return ranked[:max(n, 0)]


def main():
    """Prompt for a text file and a number n, then print the n most frequent words.

    Each output line holds a word left-aligned in 15 columns followed by its
    count right-aligned in 5 columns.

    Raises:
        OSError: if the file cannot be opened or read.
        ValueError: if the requested number of words is not an integer.
    """
    print("Analyze word frequency in a text file and report the n most frequent words.\n")

    # Get the words from the file; the with-block closes the file even if
    # reading fails.
    fname = input("File to analyze: ")
    with open(fname, "r") as infile:
        text = infile.read()                 # reads whole file

    counts = _count_words(text)

    # SECURITY: this used to be eval(input(...)), which executes whatever the
    # user types as Python code. int() accepts only a whole number.
    n = int(input("For how many words do you want a frequency count?"))

    # Output only the top n (or fewer, if the file has fewer distinct words).
    for word, count in _most_frequent(counts, n):
        print("{0:<15}{1:>5}".format(word, count))
 
# Run the analysis only when this file is executed as a script. The former
# unconditional main() call after this guard made the program run twice as a
# script and also run whenever the module was imported.
if __name__ == "__main__":
    main()
