Wikipedia Quality Pageviews Correlation Coefficient

From mingus
Jump to: navigation, search
#!/usr/bin/python
# Compute the correlation between quality and pageviews in Wikipedia
# Copyright (C) 2010 Brian Mingus

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

from urllib import FancyURLopener
from sys import argv
from scipy import array, corrcoef

# Sample sizes for the two groups being compared.
# featured_samples can be as large as the number of featured articles, but
# no attempt is made to check for duplicate picks.
featured_samples = 2000 # duplicates are not filtered out
# random_samples shouldn't really be larger than the number of featured articles.
random_samples = 2000  # keep comparable to the featured sample

class Opener(FancyURLopener):
    """FancyURLopener subclass that overrides the user-agent string."""
    # 'version' is the attribute the opener reports as its user agent.
    version = "Liam Wyatte/Python" #useragent

# Source pages. pageviews_url is a prefix: the article name is appended to it
# when the view count is requested.
# NOTE(review): '201004' in pageviews_url presumably selects April 2010 - confirm.
featured_url = 'http://toolserver.org/~erwin85/randomarticle.php?lang=en&family=wikipedia&categories=Featured+articles'
random_url = 'http://en.wikipedia.org/wiki/Special:Random'
pageviews_url = 'http://stats.grok.se/en/201004/'

# Titles collected for each group; filled by the sampling loops.
featured = []
random = []

# Single shared opener (custom user agent) used for every request.
opener = Opener()

def GetTitle(html):
    """Return the article title taken from the <title> element of html.

    Everything after the last '- ' separator is dropped (presumably the
    site-name suffix appended to the page title), and surrounding whitespace
    is stripped. Splitting on the LAST separator keeps titles that themselves
    contain '- ' intact.

    Raises IndexError if html contains no '<title>' tag.
    """
    title_text = html.split('<title>')[1].split('</title>')[0]
    # rsplit(..., 1) removes only the trailing suffix; split('- ')[0] would
    # truncate 'A - B - Site' to 'A'.
    return title_text.rsplit('- ', 1)[0].strip()

def GetViews(html):
    """Return the text between the first 'viewed' and the next 'times' in html.

    The result is a whitespace-stripped string, not a number.

    Raises IndexError if 'viewed' does not occur in html.
    """
    # Element 1 is the text following the first 'viewed' (up to a second
    # occurrence, if any).
    after_viewed = html.split('viewed', 2)[1]
    count_text = after_viewed.split('times', 1)[0]
    return count_text.strip()

for feat in range(featured_samples):
    title = GetTitle(opener.open(featured_url).read())
    print "Featured:", title
    featured.append(title)

for rand in range(random_samples):
    title = GetTitle(opener.open(random_url).read())
    print "Random:", title
    random.append(title)

featured_pageviews, random_pageviews = [], []

for feat in featured:
    pageviews = GetViews(opener.open(pageviews_url + feat).read())
    print 'Featured:', feat, 'has', pageviews, 'pageviews'
    featured_pageviews.append(pageviews)

for rand in random:
    rand = rand.strip().replace(' ','_')
    pageviews = GetViews(opener.open(pageviews_url + rand).read())

    print 'Random:', rand, 'has', pageviews, 'pageviews'
    random_pageviews.append(pageviews)

featured_quality = [1]*featured_samples # featured quality = 1
random_quality = [0]*random_samples # random quality = 0

pageviews = array(featured_pageviews + random_pageviews)
quality = array(featured_quality + random_quality)

print "Quality/Pageviews correlation coefficient", corrcoef(pageviews, quality)[1][0]