'''
Created on Nov 26, 2011

@author: Ryan Rossi (rrossi@purdue.edu)
'''

import urllib
import gzip
import os
#################################################
# Download the hourly Wikipedia pagecounts dumps, decompress them, and
# write one per-node view-count file for every hour processed.
#################################################
# Progress banner printed once at start-up (Python 2 print statement).
print 'Downloading Wikipedia Hourly Pageviews...'

# Remote directory holding the March 2009 hourly dumps, and the fixed parts
# of every dump file name: <prefix><day>-<hour><suffix>.
url = 'http://dumps.wikimedia.org/other/pagecounts-raw/2009/2009-03/'
prefix = 'pagecounts-200903'
suffix = '.gz'

# Zero-padded days of the month to fetch: '03' through '30'.
day = ['%02d' % n for n in range(3, 31)]

# Timestamp component for each hour of the day ('000000' ... '230000').
hour = ['%02d0000' % n for n in range(24)]

# Alternative timestamps ('000001' ... '230001'), tried when the download
# for the corresponding entry in `hour` comes back too small.
hour_fix = ['%02d0001' % n for n in range(24)]

# Paths of the extracted hourly files, appended to as each one is processed.
files = []

nlabels = open('enwiki-20090306-scc.pages', 'rU').readlines()
print len(nlabels)


node_labels = {}
nids = {}
node_id = 1
for label in nlabels:
    node_labels[label.replace('\n','')] = node_id
    nids[node_id] = label.replace('\n','');
    node_id = node_id + 1
    
    

dir = 'pageviews_tmp/'
os.mkdir(dir)

for d in range(len(day)):
    for h in range(len(hour)):
        
        print h

        filename = prefix + day[d] + '-' + hour[h]
        path = url + filename + suffix
        print path
        handle = urllib.urlopen(path)
        
        
        with open(dir + filename + suffix, 'wb') as out:
            while True:
                data = handle.read(1024)
                if len(data) == 0: break
                out.write(data)
                
        size = os.path.getsize(dir + filename + suffix)
        print size
        
        #50MB file --> 51,349,937
        if size < 1000000:
            #failed to download file, try pertubating the hour/min
            os.remove(dir + filename + suffix)
            filename = prefix + day[d] + '-' + hour_fix[h]
            path = url + filename + suffix
            print path
            handle = urllib.urlopen(path)
            
            
            with open(dir + filename + suffix, 'wb') as out:
                while True:
                    data = handle.read(1024)
                    if len(data) == 0: break
                    out.write(data)
            
        files.append(dir + filename);    
        
        # Extract SEED database
        handle = gzip.open(dir + filename + suffix)
        #gzip.GzipFile(fileobj=handle)
        with open(dir + filename, 'w') as out:
            for line in handle:
                out.write(line)
        
        
        page_counts = {}
        pageviews = open(dir + filename, 'rU')
        while 1:
            line = pageviews.readline()
            if not line:
                break
            pass #process line
            page = line.split(' ')
            
            project = page[0]
            
            if project.find('en') != -1:
                label = page[1].replace('_', ' ')
                count = page[2]
                
                if node_labels.get(label, "empty") != 'empty':
                    page_counts[label] = count
                
        print 'pages with counts: ' + str(len(page_counts.keys()))
        
        file = open(dir + filename + '.views', 'w')    
        for id in nids:
            if page_counts.get(nids[id], "empty") != "empty":
                file.write(str(page_counts[nids[id]]) + '\n')
            else:
                file.write(str(0) + '\n')
        file.close()

        os.remove(dir + filename)
        os.remove(dir + filename + suffix)
        
        
        pass