/**
 * Copyright 2007 The Apache Software Foundation, All Rights Reserved,
 * Copyright 2008 Ethan Blanton.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License.  You may
 * obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 *
 * @see http://hadoop.apache.org/core/docs/r0.18.0/mapred_tutorial.html
 */

package edu.purdue.cs.eblanton;

import edu.purdue.cs.eblanton.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.*;

/**
 * Main execution class for a Hadoop map/reduce operation which counts
 * occurrences of every unique whitespace-separated word in the files
 * from a specified input directory, and writes this output to a
 * specified output directory.
 *
 * @author The Apache Software Foundation
 * @author Ethan Blanton
 */
public class WordCount {
    /** Usage summary printed when too few command arguments are given. */
    private static final String USAGE =
        "Usage: WordCount <input-dir> <output-dir>";

    /**
     * Performs the WordCount map/reduce operation.
     *
     * @param args      the command arguments.  <code>args[0]</code>
     *                  should hold the name of a directory containing
     *                  text files for input, and <code>args[1]</code>
     *                  should hold the name of a non-existent
     *                  directory to be created for output.  Any
     *                  further arguments are ignored.  If fewer than
     *                  two arguments are given, a usage message is
     *                  printed to standard error and the JVM exits
     *                  with status 2.
     * @throws Exception if configuring, submitting, or running the job
     *                   fails; the failure is propagated unchanged.
     */
    public static void main(String[] args) throws Exception {
        /*
         * Fail early with a readable message instead of letting the
         * args[] accesses below raise ArrayIndexOutOfBoundsException.
         */
        if (args.length < 2) {
            System.err.println(USAGE);
            System.exit(2);
        }

        /*
         * Every Hadoop job requires a JobConf, which tells the
         * framework what map and reduce operations are to be performed.
         */
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordcount");

        /*
         * The ultimate output of our job will be key, value pairs
         * consisting of a word and an integer count of the number of
         * times it occurred in the input directory.
         */
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        /*
         * We will perform a WordMap mapping, followed by a
         * CounterReduce reduction.  In this case, the combine and
         * reduce operations are the same (a simple sum), so the same
         * class is registered for both.
         */
        conf.setMapperClass(WordMap.class);
        conf.setCombinerClass(CounterReduce.class);
        conf.setReducerClass(CounterReduce.class);

        /*
         * The TextInputFormat class feeds one line of text at a time to
         * our WordMap mapper.  The TextOutputFormat class outputs its
         * key and value fields as simple text, one pair per line
         * (tab-separated by default), in files named part-00000
         * through part-nnnnn.
         */
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        /*
         * args[0] contains an input directory from which to read files
         * to word count, and args[1] contains a non-existent output
         * directory to be created and contain the part-nnnnn sum files.
         */
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        /* Submit the job and block until it has finished. */
        JobClient.runJob(conf);
    }
}