# Wikipedia Quality Pageviews Correlation Coefficient
#
# From mingus (wiki page export)
# Compute the correlation between quality and pageviews in Wikipedia
# Copyright (C) 2010 Brian Mingus

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

from urllib import FancyURLopener
from sys import argv
from scipy import array, corrcoef

# Number of featured-article pages to sample. This may be as large as the
# number of featured articles, but no attempt is made to detect duplicates,
# so the same article can be sampled more than once.
featured_samples = 2000
# Number of random-article pages to sample. Keep this no larger than the
# number of featured samples.
random_samples = 2000

class Opener(FancyURLopener):
    """URL opener that identifies itself with a custom user-agent string."""
    # FancyURLopener sends the class attribute `version` as the User-Agent.
    version = "Liam Wyatte/Python"

# Single opener instance; presumably used for the page fetches further down.
opener = Opener()

# NOTE(review): all three URLs are empty strings in this copy of the script.
# Presumably the real addresses were removed; they must be filled in before
# the script can fetch anything.
featured_url = ''
random_url = ''
pageviews_url = ''

# Article titles for each group; both lists are iterated later when the
# pageviews are collected. NOTE: `random` here is a plain list, not the
# standard-library `random` module.
featured, random = [], []

def GetTitle(html):
    """Return the article title taken from the page's <title> element.

    The text between the first '<title>' and the next '</title>' is
    extracted, and everything from the last '- ' onward (the site-name
    suffix) is removed. Surrounding whitespace is stripped.

    Cutting at the LAST '- ' rather than the first keeps titles that
    themselves contain '- ' intact (e.g. 'A - B - Site' yields 'A - B'
    instead of 'A').

    Raises IndexError if html contains no '<title>' tag.
    """
    title_text = html.split('<title>')[1].split('</title>')[0]
    # rsplit with maxsplit=1 removes only the trailing suffix; when there is
    # no '- ' at all the whole text is returned unchanged.
    return title_text.rsplit('- ', 1)[0].strip()

def GetViews(html):
    """Return the text between 'viewed' and the next 'times' in html.

    The result is the stripped substring (a str, not a number) that follows
    the first occurrence of 'viewed' and precedes the following 'times'.

    Raises IndexError if 'viewed' does not occur in html.
    """
    after_viewed = html.split('viewed')[1]
    before_times = after_viewed.split('times')[0]
    return before_times.strip()

# NOTE(review): in the original text the argument of every GetTitle(...) and
# GetViews(...) call was cut off, leaving the script unparseable. The calls
# are reconstructed here as opener.open(<url>).read(); confirm against the
# author's version. The *_url variables are empty strings in this file and
# must be set to real addresses before this section can run.

# --- Sample article titles ---------------------------------------------------
# Each request to featured_url / random_url is expected to return one article
# page; its title is recorded so the pageviews can be looked up afterwards.
for feat in range(featured_samples):
    title = GetTitle(opener.open(featured_url).read())
    featured.append(title)
    print("Featured: %s" % title)

for rand in range(random_samples):
    title = GetTitle(opener.open(random_url).read())
    random.append(title)
    print("Random: %s" % title)

# --- Look up pageviews for every sampled title -------------------------------
featured_pageviews, random_pageviews = [], []

for feat in featured:
    views = GetViews(opener.open(pageviews_url + feat).read())
    # GetViews returns text; corrcoef needs numbers. int() raises ValueError
    # if the scraped text is not a plain integer.
    featured_pageviews.append(int(views))
    print('Featured: %s has %s pageviews' % (feat, views))

for rand in random:
    # Titles are turned into URL form (underscores instead of spaces).
    rand = rand.strip().replace(' ', '_')
    views = GetViews(opener.open(pageviews_url + rand).read())
    random_pageviews.append(int(views))
    print('Random: %s has %s pageviews' % (rand, views))

# --- Correlate quality with pageviews ----------------------------------------
# Quality is a binary label: 1 for featured articles, 0 for random ones. The
# label lists are sized from the data actually collected so both arrays passed
# to corrcoef always have the same length.
featured_quality = [1] * len(featured_pageviews)
random_quality = [0] * len(random_pageviews)

pageviews = array(featured_pageviews + random_pageviews)
quality = array(featured_quality + random_quality)

# corrcoef returns the 2x2 correlation matrix; [1][0] is the off-diagonal
# entry, i.e. the correlation between the two variables.
print("Quality/Pageviews correlation coefficient %s" % corrcoef(pageviews, quality)[1][0])