Wikipedia:List of Wikipedians by article count/Configuration

From Wikipedia, the free encyclopedia

articlecount.py[edit]

#! /usr/bin/env python
# Public domain; MZMcBride; 2015

import re

import oursql
import wikitools

import settings

def thousands(x): # From http://code.activestate.com/recipes/498181/
    return re.sub(r'(\d{3})(?=\d)', r'\1,', str(x)[::-1])[::-1]

def get_user_list(cursor, page):
    users = []
    cursor.execute('''
                   /* articlecount.py */
                   SELECT DISTINCT
                     pl_title
                   FROM page
                   JOIN pagelinks
                   ON pl_from = page_id
                   WHERE page_title = ?
                   AND page_namespace = 4
                   AND pl_namespace IN (2,3);
                   ''', (page,))
    for row in cursor.fetchall():
        user_name = u'%s' % unicode(row[0].replace('_', ' '), 'utf-8')
        users.append(user_name)
    return users

base_page = 'Wikipedia:List of Wikipedians by article count/'
ranges = [u'1\u20131000',
          u'1001\u20132000',
          u'2001\u20133000',
          u'3001\u20134000',
          u'4001\u20135000',
          u'5001\u20136000',
          u'6001\u20137000',
          u'7001\u20138000',
          u'8001\u20139000',
          u'9001\u201310000']

report_template = u'''\
=== %s ===
{| class="wikitable sortable"
|- style="white-space:nowrap;"
! No.
! User
! Non-redirects
! Redirects
! Total
|-
%s
|}
'''

row_template = u'''\
| %d
| %s
| %s
| %s
| %s
|-'''

wiki = wikitools.Wiki()
wiki.login(settings.username, settings.password)

conn = oursql.connect(host=settings.host,
                      db=settings.dbname,
                      read_default_file='~/.my.cnf')
cursor = conn.cursor()

opt_out = get_user_list(cursor, 'List_of_Wikipedians_by_number_of_edits/Anonymous')

cursor.execute('''
/* articlecount.py */
SELECT
  rev_user_text,
  SUM(CASE WHEN p1.page_is_redirect = 0 THEN 1 ELSE 0 END) AS notredir,
  SUM(CASE WHEN p1.page_is_redirect = 1 THEN 1 ELSE 0 END) AS redir,
  SUM(CASE WHEN p1.page_is_redirect = 0 THEN 1 ELSE 0 END) + 
  SUM(CASE WHEN p1.page_is_redirect = 1 THEN 1 ELSE 0 END) AS total
FROM revision_userindex
JOIN s51334__enwiki_first_page_revisions_p.page AS p2
ON rev_id = p2.page_first_revision_id
JOIN page AS p1
ON p1.page_id = p2.page_id
GROUP BY rev_user_text
ORDER BY total DESC
LIMIT 10000;
''')

i = 1
output = []
for row in cursor.fetchall():
    user_name = u'%s' % unicode(row[0], 'utf-8')
    if user_name in opt_out:
        user_name = '[Placeholder]'
    else:
        user_name = u'[[User:%s|%s]]' % (user_name, user_name)
    non_redirect_count = thousands(row[1])
    redirect_count = thousands(row[2])
    total = thousands(row[3])
    table_row = row_template % (i, user_name, non_redirect_count, redirect_count, total)
    output.append(table_row)
    i += 1

start = 0
end = 1000
per_page = 1000
for range in ranges:
    report = wikitools.Page(wiki, base_page+range.encode('utf-8'))
    report_text = report_template % (range, '\n'.join(output[start:end]))
    report_text = report_text.encode('utf-8')
    report.edit(report_text, summary=settings.editsumm, bot=1)
    start += per_page
    end += per_page

cursor.close()
conn.close()

update-revisions-database.py[edit]

#! /usr/bin/env python
# Public domain; MZMcBride; 2015

import oursql

'''
CREATE TABLE page (
    page_id int unsigned NOT NULL PRIMARY KEY,
    page_first_revision_id int unsigned NOT NULL
);
CREATE INDEX page_first_revision_id ON page (page_first_revision_id);
'''

enwiki_conn = oursql.connect(
    host='enwiki.labsdb',
    db='enwiki_p',
    read_default_file='~/.my.cnf'
)
enwiki_cursor = enwiki_conn.cursor()
revisions_conn = oursql.connect(
    host='enwiki.labsdb',
    db='s51334__enwiki_first_page_revisions_p',
    read_default_file='~/.my.cnf'
)
revisions_cursor = revisions_conn.cursor()

def get_pages(cursor, namespace=None, offset=0):
    pages = set()
    if namespace is not None:
        cursor.execute('''
            SELECT
                page_id
            FROM page
            WHERE page_id > ?
            AND page_namespace = ?
            ORDER BY page_id ASC
            LIMIT 100000
        ''', (offset, namespace))
    else:
        cursor.execute('''
            SELECT
                page_id
            FROM page
            WHERE page_id > ?
            ORDER BY page_id ASC
            LIMIT 100000
        ''', (offset,))
    for row in cursor.fetchall():
        pages.add(int(row[0]))
    return pages

def get_first_revision(cursor, page_id):
    cursor.execute('''
        SELECT
            rev_id
        FROM revision_userindex
        WHERE rev_page = ?
        ORDER BY rev_timestamp ASC
        LIMIT 1
    ''', (page_id,))
    return int(cursor.fetchall()[0][0])

def insert_first_revision(cursor, page_id, revision_id):
    cursor.execute('''
        INSERT INTO
            page
        SET
            page_id = ?,
            page_first_revision_id = ?
        ON DUPLICATE KEY UPDATE
            page_id = ?,
            page_first_revision_id = ?
    ''', (page_id, revision_id, page_id, revision_id))
    return

def delete_dead_page(cursor, page_id):
    cursor.execute('''
        DELETE FROM
            page
        WHERE page_id = ?
    ''', (page_id,))
    return

offset = 0
while True:
    source_pages = get_pages(enwiki_cursor, 0, offset)
    target_pages = get_pages(revisions_cursor, None, offset)
    if not source_pages:
        break
    for page in source_pages.difference(target_pages):
        try:
            first_revision = get_first_revision(enwiki_cursor, page)
        except:  # Temp. hack due to https://phabricator.wikimedia.org/T118095
            print(page)
            first_revision = None
        if first_revision is not None:
            insert_first_revision(revisions_cursor, page, first_revision)
    for page in target_pages.difference(source_pages):
        delete_dead_page(revisions_cursor, page)
    offset = max(source_pages)

enwiki_cursor.close()
enwiki_conn.close()
revisions_cursor.close()
revisions_conn.close()

verify-article-counts.py[edit]

#! /usr/bin/env python
# Public domain; MZMcBride; 2015

import sys
try:
    target_user = sys.argv[1]
except IndexError:
    target_user = 'Lugnuts'

import oursql

import settings

conn = oursql.connect(host=settings.host,
                      db=settings.dbname,
                      read_default_file='~/.my.cnf')
cursor = conn.cursor()

cursor.execute('''
/* verify-article-counts.py */
SELECT
  rev_user_text,
  SUM(CASE WHEN p1.page_is_redirect = 0 THEN 1 ELSE 0 END) AS notredir,
  SUM(CASE WHEN p1.page_is_redirect = 1 THEN 1 ELSE 0 END) AS redir,
  SUM(CASE WHEN p1.page_is_redirect = 0 THEN 1 ELSE 0 END) + 
  SUM(CASE WHEN p1.page_is_redirect = 1 THEN 1 ELSE 0 END) AS total
FROM revision_userindex
JOIN s51334__enwiki_first_page_revisions_p.page AS p2
ON rev_id = p2.page_first_revision_id
AND rev_user_text = ?
JOIN page AS p1
ON p1.page_id = p2.page_id
GROUP BY rev_user_text;
''', (target_user,))

cached_count = cursor.fetchall()
for i in cached_count:
    user_name = i[0]
    counts = [
        str(i[1]),
        str(i[2]),
        str(i[3])
    ]

print('Cached counts for {0}:   {1}'.format(
    user_name,
    '   '.join(counts)
))

cursor.execute('''
/* verify-article-counts.py */
SELECT
  r1.rev_user_text,
  SUM(CASE WHEN p1.page_is_redirect = 0 THEN 1 ELSE 0 END) AS notredir,
  SUM(CASE WHEN p1.page_is_redirect = 1 THEN 1 ELSE 0 END) AS redir,
  SUM(CASE WHEN p1.page_is_redirect = 0 THEN 1 ELSE 0 END) + 
  SUM(CASE WHEN p1.page_is_redirect = 1 THEN 1 ELSE 0 END) AS total
FROM revision_userindex AS r1
JOIN page AS p1
ON r1.rev_page = p1.page_id
WHERE r1.rev_user_text = ?
AND p1.page_namespace = 0
AND r1.rev_timestamp = (SELECT
                          r2.rev_timestamp
                        FROM revision_userindex AS r2
                        JOIN page AS p2
                        ON p2.page_id = r2.rev_page
                        WHERE r2.rev_page = r1.rev_page
                        ORDER BY r2.rev_timestamp ASC
                        LIMIT 1)
GROUP BY r1.rev_user_text;
''', (target_user,))

calced_count = cursor.fetchall()
for i in calced_count:
    user_name = i[0]
    counts = [
        str(i[1]),
        str(i[2]),
        str(i[3])
    ]

print('Calced counts for {0}:   {1}'.format(
    user_name,
    '   '.join(counts)
))

cursor.close()
conn.close()

crontab[edit]

10 5 * * * /usr/bin/jsub -N tools.mzmcbride-2 -stderr -once -o $HOME/logs -quiet python $HOME/scripts/enwiki/update-revisions-database.py
15 6 * * * /usr/bin/jsub -N tools.mzmcbride-3 -stderr -once -o $HOME/logs -quiet python $HOME/scripts/enwiki/articlecount.py